Skip to content

Commit

Permalink
make out-of-core section stand-alone executable in notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Apr 25, 2018
1 parent b810a64 commit 5cbb7ca
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 47 deletions.
131 changes: 85 additions & 46 deletions code/ch08/ch08.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1327,16 +1327,51 @@
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# This cell is not contained in the book but\n",
"# added for convenience so that the notebook\n",
"# can be executed starting here, without\n",
"# executing prior code in this notebook\n",
"\n",
"import os\n",
"import gzip\n",
"\n",
"\n",
"if not os.path.isfile('movie_data.csv'):\n",
" if not os.path.isfile('movie_data.csv.gz'):\n",
" print('Please place a copy of the movie_data.csv.gz'\n",
" 'in this directory. You can obtain it by'\n",
" 'a) executing the code in the beginning of this'\n",
" 'notebook or b) by downloading it from GitHub:'\n",
" 'https://github.com/rasbt/python-machine-learning-'\n",
" 'book-2nd-edition/blob/master/code/ch08/movie_data.csv.gz')\n",
" else:\n",
" with in_f = gzip.open('movie_data.csv.gz', 'rb'), \\\n",
" out_f = open('movie_data.csv', 'wb'):\n",
" out_f.write(in_f.read())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"\n",
"\n",
"# The `stop` is defined as earlier in this chapter\n",
"# Added it here for convenience, so that this section\n",
"# can be run as standalone without executing prior code\n",
"# in the directory\n",
"stop = stopwords.words('english')\n",
"\n",
"\n",
"def tokenizer(text):\n",
" text = re.sub('<[^>]*>', '', text)\n",
" emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
Expand All @@ -1356,7 +1391,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -1366,7 +1401,7 @@
" 1)"
]
},
"execution_count": 28,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1377,10 +1412,8 @@
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_minibatch(doc_stream, size):\n",
Expand All @@ -1397,29 +1430,18 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sebastian/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"\n",
"vect = HashingVectorizer(decode_error='ignore', \n",
" n_features=2**21,\n",
" preprocessor=None, \n",
" tokenizer=tokenizer)\n",
"\n",
"clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
"doc_stream = stream_docs(path='movie_data.csv')"
" tokenizer=tokenizer)"
]
},
{
Expand All @@ -1428,20 +1450,39 @@
"source": [
"**Note**\n",
"\n",
"- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19. The `n_iter` parameter is used here deriberately, because some people still use scikit-learn 0.18.\n"
"- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19."
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from distutils.version import LooseVersion as Version\n",
"from sklearn import __version__ as sklearn_version\n",
"\n",
"\n",
"if Version(sklearn_version) < '0.18':\n",
" clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
"else:\n",
" clf = SGDClassifier(loss='log', random_state=1, max_iter=1)\n",
"\n",
"\n",
"doc_stream = stream_docs(path='movie_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0% [##############################] 100% | ETA: 00:00:00\n",
"Total time elapsed: 00:00:31\n"
"Total time elapsed: 00:00:28\n"
]
}
],
Expand All @@ -1461,7 +1502,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -1480,10 +1521,8 @@
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"clf = clf.partial_fit(X_test, y_test)"
Expand Down Expand Up @@ -1792,15 +1831,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook ch08.ipynb to script\n",
"[NbConvertApp] Writing 24613 bytes to ch08.py\n"
"[NbConvertApp] Writing 11500 bytes to ch08.txt\n"
]
}
],
Expand All @@ -1816,17 +1855,17 @@
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
Expand Down
12 changes: 11 additions & 1 deletion code/ch08/ch08.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import LatentDirichletAllocation
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# Added version check for recent scikit-learn 0.18 checks


# *Python Machine Learning 2nd Edition* by [Sebastian Raschka](https://sebastianraschka.com), Packt Publishing Ltd. 2017
#
Expand Down Expand Up @@ -577,7 +583,11 @@ def get_minibatch(doc_stream, size):
preprocessor=None,
tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
if Version(sklearn_version) < '0.19':
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

doc_stream = stream_docs(path='movie_data.csv')


Expand Down

0 comments on commit 5cbb7ca

Please sign in to comment.