From 005dbae7696204d83334f167140d711bcc16d921 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sun, 17 Nov 2019 00:23:04 -0600 Subject: [PATCH] address ch08 changes --- ch08/ch08.ipynb | 215 +++++++++++++++++------------------------------- ch08/ch08.py | 55 ++++--------- 2 files changed, 88 insertions(+), 182 deletions(-) diff --git a/ch08/ch08.ipynb b/ch08/ch08.ipynb index b75dc7ec..23ee611d 100644 --- a/ch08/ch08.ipynb +++ b/ch08/ch08.ipynb @@ -42,15 +42,15 @@ "output_type": "stream", "text": [ "Sebastian Raschka \n", - "last updated: 2019-06-14 \n", + "last updated: 2019-11-16 \n", "\n", - "CPython 3.7.3\n", - "IPython 7.5.0\n", + "CPython 3.7.1\n", + "IPython 7.9.0\n", "\n", - "numpy 1.16.4\n", + "numpy 1.17.2\n", "pandas 0.24.2\n", - "sklearn 0.21.1\n", - "nltk 3.4.1\n" + "sklearn 0.21.3\n", + "nltk not installed\n" ] } ], @@ -161,7 +161,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "100% | 80 MB | 0.61 MB/s | 131 sec elapsed" + "100% | 80 MB | 0.74 MB/s | 108 sec elapsed" ] } ], @@ -225,7 +225,7 @@ "output_type": "stream", "text": [ "0% [##############################] 100% | ETA: 00:00:00\n", - "Total time elapsed: 00:02:05\n" + "Total time elapsed: 00:02:06\n" ] } ], @@ -435,9 +435,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -525,9 +523,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "np.set_printoptions(precision=2)" @@ -611,13 +607,13 @@ "To make sure that we understand how TfidfTransformer works, let us walk\n", "through an example and calculate the tf-idf of the word is in the 3rd document.\n", "\n", - "The word is has a term frequency of 3 (tf = 3) in document 3, and the document frequency of this term is 3 since the term is occurs in all three documents (df = 3). Thus, we can calculate the idf as follows:\n", + "The word is has a term frequency of 3 (tf = 3) in document 3 ($d_3$), and the document frequency of this term is 3 since the term is occurs in all three documents (df = 3). Thus, we can calculate the idf as follows:\n", "\n", - "$$\\text{idf}(\"is\", d3) = log \\frac{1+3}{1+3} = 0$$\n", + "$$\\text{idf}(\"is\", d_3) = log \\frac{1+3}{1+3} = 0$$\n", "\n", "Now in order to calculate the tf-idf, we simply need to add 1 to the inverse document frequency and multiply it by the term frequency:\n", "\n", - "$$\\text{tf-idf}(\"is\",d3)= 3 \\times (0+1) = 3$$" + "$$\\text{tf-idf}(\"is\", d_3)= 3 \\times (0+1) = 3$$" ] }, { @@ -746,9 +742,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import re\n", @@ -804,9 +798,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "df['review'] = df['review'].apply(preprocessor)" @@ -828,10 +820,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], "source": [ "from nltk.stem.porter import PorterStemmer\n", @@ -848,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -857,7 +847,7 @@ "['runners', 'like', 'running', 'and', 'thus', 'they', 'run']" ] }, - "execution_count": 23, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -868,7 +858,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -877,7 +867,7 @@ "['runner', 'like', 'run', 'and', 'thu', 'they', 'run']" ] }, - "execution_count": 24, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -888,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -897,7 +887,7 @@ "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/sebastian/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" + "[nltk_data] Unzipping corpora/stopwords.zip.\n" ] }, { @@ -906,7 +896,7 @@ "True" ] }, - "execution_count": 25, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -919,7 +909,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -928,7 +918,7 @@ "['runner', 'like', 'run', 'run', 'lot']" ] }, - "execution_count": 26, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -965,10 +955,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "X_train = df.loc[:25000, 'review'].values\n", @@ -979,10 +967,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -1009,7 +995,7 @@ " ]\n", "\n", "lr_tfidf = Pipeline([('vect', tfidf),\n", - " ('clf', LogisticRegression(random_state=0, solver='lbfgs'))])\n", + " ('clf', LogisticRegression(random_state=0, solver='liblinear'))])\n", "\n", "gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,\n", " scoring='accuracy',\n", @@ -1033,7 +1019,7 @@ "source": [ "**Important Note about the running time**\n", "\n", - "Executing the following code cell **may take up to 30-60 min** depending on your machine, since based on the parameter grid we defined, there are 2*2*2*3*5 + 2*2*2*3*5 = 240 models to fit.\n", + "Executing the following code cell **may take up to 30-60 min** depending on your machine, since based on the parameter grid we defined, there are `2*2*2*3*5` + `2*2*2*3*5` = 240 models to fit.\n", "\n", "If you do not wish to wait so long, you could reduce the size of the dataset by decreasing the number of training examples, for example, as follows:\n", "\n", @@ -1052,32 +1038,7 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "## @Readers: PLEASE IGNORE THIS CELL\n", - "##\n", - "## This cell is meant to generate more \n", - "## \"logging\" output when this notebook is run \n", - "## on the Travis Continuous Integration\n", - "## platform to test the code as well as\n", - "## speeding up the run using a smaller\n", - "## dataset for debugging\n", - "\n", - "if 'TRAVIS' in os.environ:\n", - " gs_lr_tfidf.verbose=2\n", - " X_train = df.loc[:250, 'review'].values\n", - " y_train = df.loc[:250, 'sentiment'].values\n", - " X_test = df.loc[25000:25250, 'review'].values\n", - " y_test = df.loc[25000:25250, 'sentiment'].values" - ] - }, - { - "cell_type": "code", - "execution_count": 30, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1092,11 +1053,9 @@ "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 4.2min\n", - "[Parallel(n_jobs=-1)]: Done 138 tasks | elapsed: 25.7min\n", - "[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 44.2min finished\n", - "/Users/sebastian/miniconda3/envs/pymle3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", - " FutureWarning)\n" + "[Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 4.0min\n", + "[Parallel(n_jobs=-1)]: Done 138 tasks | elapsed: 22.6min\n", + "[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 39.3min finished\n" ] }, { @@ -1128,14 +1087,14 @@ " 'herself', 'it', \"it's\", 'its',\n", " 'itself', ...],\n", " None],\n", - " 'vect__tokenizer': [,\n", - " ],\n", + " 'vect__tokenizer': [,\n", + " ],\n", " 'vect__use_idf': [False]}],\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", " scoring='accuracy', verbose=2)" ] }, - "execution_count": 30, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1146,14 +1105,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': } \n", + "Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': } \n", "CV Accuracy: 0.897\n" ] } @@ -1165,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1200,7 +1159,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1209,7 +1168,7 @@ "array([0.4, 0.2, 0.6, 0.2, 0.4])" ] }, - "execution_count": 33, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1229,7 +1188,7 @@ "cv5_idx = list(StratifiedKFold(n_splits=5, shuffle=False, random_state=0).split(X, y))\n", " \n", "lr = LogisticRegression(random_state=123, multi_class='ovr', solver='lbfgs')\n", - "cross_val_score(lr, X, y, cv=cv5_idx)\n" + "cross_val_score(lr, X, y, cv=cv5_idx)" ] }, { @@ -1243,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -1297,7 +1256,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -1306,7 +1265,7 @@ "0.36" ] }, - "execution_count": 35, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1324,7 +1283,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -1333,7 +1292,7 @@ "0.36000000000000004" ] }, - "execution_count": 36, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1370,7 +1329,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1399,7 +1358,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1434,7 +1393,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -1444,7 +1403,7 @@ " 1)" ] }, - "execution_count": 39, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -1455,7 +1414,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1473,7 +1432,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -1489,7 +1448,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1504,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1512,7 +1471,7 @@ "output_type": "stream", "text": [ "0% [##############################] 100% | ETA: 00:00:00\n", - "Total time elapsed: 00:00:20\n" + "Total time elapsed: 00:00:18\n" ] } ], @@ -1532,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1551,7 +1510,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -1581,7 +1540,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -1636,7 +1595,7 @@ "2 ***SPOILER*** Do not read this, if you think a... 0" ] }, - "execution_count": 46, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1650,32 +1609,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "## @Readers: PLEASE IGNORE THIS CELL\n", - "##\n", - "## This cell is meant to create a smaller dataset if\n", - "## the notebook is run on the Travis Continuous Integration\n", - "## platform to test the code on a smaller dataset\n", - "## to prevent timeout errors and just serves a debugging tool\n", - "## for this notebook\n", - "\n", - "if 'TRAVIS' in os.environ:\n", - " df.loc[:500].to_csv('movie_data.csv')\n", - " df = pd.read_csv('movie_data.csv', nrows=500)\n", - " print('SMALL DATA SUBSET CREATED FOR TESTING')" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", @@ -1688,10 +1623,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": true - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import LatentDirichletAllocation\n", @@ -1704,7 +1637,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -1713,7 +1646,7 @@ "(10, 5000)" ] }, - "execution_count": 50, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1724,7 +1657,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -1792,7 +1725,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 58, "metadata": { "scrolled": true }, @@ -1861,7 +1794,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -1869,7 +1802,7 @@ "output_type": "stream", "text": [ "[NbConvertApp] Converting notebook ch08.ipynb to script\n", - "[NbConvertApp] Writing 25638 bytes to ch08.py\n" + "[NbConvertApp] Writing 24745 bytes to ch08.py\n" ] } ], @@ -1911,5 +1844,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/ch08/ch08.py b/ch08/ch08.py index 04977b7b..0e385552 100644 --- a/ch08/ch08.py +++ b/ch08/ch08.py @@ -44,7 +44,15 @@ -# *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.* +# *The use of `watermark` is optional. You can install this Jupyter extension via* +# +# conda install watermark -c conda-forge +# +# or +# +# pip install watermark +# +# *For more information, please see: https://github.com/rasbt/watermark.* # ### Overview @@ -271,13 +279,13 @@ def reporthook(count, block_size, total_size): # To make sure that we understand how TfidfTransformer works, let us walk # through an example and calculate the tf-idf of the word is in the 3rd document. # -# The word is has a term frequency of 3 (tf = 3) in document 3, and the document frequency of this term is 3 since the term is occurs in all three documents (df = 3). Thus, we can calculate the idf as follows: +# The word is has a term frequency of 3 (tf = 3) in document 3 ($d_3$), and the document frequency of this term is 3 since the term is occurs in all three documents (df = 3). Thus, we can calculate the idf as follows: # -# $$\text{idf}("is", d3) = log \frac{1+3}{1+3} = 0$$ +# $$\text{idf}("is", d_3) = log \frac{1+3}{1+3} = 0$$ # # Now in order to calculate the tf-idf, we simply need to add 1 to the inverse document frequency and multiply it by the term frequency: # -# $$\text{tf-idf}("is",d3)= 3 \times (0+1) = 3$$ +# $$\text{tf-idf}("is", d_3)= 3 \times (0+1) = 3$$ @@ -420,7 +428,7 @@ def tokenizer_porter(text): ] lr_tfidf = Pipeline([('vect', tfidf), - ('clf', LogisticRegression(random_state=0))]) + ('clf', LogisticRegression(random_state=0, solver='liblinear'))]) gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', @@ -435,7 +443,7 @@ def tokenizer_porter(text): # **Important Note about the running time** # -# Executing the following code cell **may take up to 30-60 min** depending on your machine, since based on the parameter grid we defined, there are 2*2*2*3*5 + 2*2*2*3*5 = 240 models to fit. +# Executing the following code cell **may take up to 30-60 min** depending on your machine, since based on the parameter grid we defined, there are `2*2*2*3*5` + `2*2*2*3*5` = 240 models to fit. # # If you do not wish to wait so long, you could reduce the size of the dataset by decreasing the number of training examples, for example, as follows: # @@ -453,25 +461,6 @@ def tokenizer_porter(text): -## @Readers: PLEASE IGNORE THIS CELL -## -## This cell is meant to generate more -## "logging" output when this notebook is run -## on the Travis Continuous Integration -## platform to test the code as well as -## speeding up the run using a smaller -## dataset for debugging - -if 'TRAVIS' in os.environ: - gs_lr_tfidf.verbose=2 - X_train = df.loc[:250, 'review'].values - y_train = df.loc[:250, 'sentiment'].values - X_test = df.loc[25000:25250, 'review'].values - y_test = df.loc[25000:25250, 'sentiment'].values - - - - gs_lr_tfidf.fit(X_train, y_train) @@ -669,22 +658,6 @@ def get_minibatch(doc_stream, size): -## @Readers: PLEASE IGNORE THIS CELL -## -## This cell is meant to create a smaller dataset if -## the notebook is run on the Travis Continuous Integration -## platform to test the code on a smaller dataset -## to prevent timeout errors and just serves a debugging tool -## for this notebook - -if 'TRAVIS' in os.environ: - df.loc[:500].to_csv('movie_data.csv') - df = pd.read_csv('movie_data.csv', nrows=500) - print('SMALL DATA SUBSET CREATED FOR TESTING') - - - - count = CountVectorizer(stop_words='english', max_df=.1,