Skip to content

Commit

Permalink
make out-of-core section stand-alone executable in notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Apr 25, 2018
1 parent b810a64 commit 5cbb7ca
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 47 deletions.
131 changes: 85 additions & 46 deletions code/ch08/ch08.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1327,16 +1327,51 @@
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# This cell is not contained in the book but\n",
"# added for convenience so that the notebook\n",
"# can be executed starting here, without\n",
"# executing prior code in this notebook\n",
"\n",
"import os\n",
"import gzip\n",
"\n",
"\n",
"if not os.path.isfile('movie_data.csv'):\n",
" if not os.path.isfile('movie_data.csv.gz'):\n",
" print('Please place a copy of the movie_data.csv.gz'\n",
" 'in this directory. You can obtain it by'\n",
" 'a) executing the code in the beginning of this'\n",
" 'notebook or b) by downloading it from GitHub:'\n",
" 'https://github.com/rasbt/python-machine-learning-'\n",
" 'book-2nd-edition/blob/master/code/ch08/movie_data.csv.gz')\n",
" else:\n",
" with in_f = gzip.open('movie_data.csv.gz', 'rb'), \\\n",
" out_f = open('movie_data.csv', 'wb'):\n",
" out_f.write(in_f.read())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"\n",
"\n",
"# The `stop` is defined as earlier in this chapter\n",
"# Added it here for convenience, so that this section\n",
"# can be run as standalone without executing prior code\n",
"# in the directory\n",
"stop = stopwords.words('english')\n",
"\n",
"\n",
"def tokenizer(text):\n",
" text = re.sub('<[^>]*>', '', text)\n",
" emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
Expand All @@ -1356,7 +1391,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -1366,7 +1401,7 @@
" 1)"
]
},
"execution_count": 28,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -1377,10 +1412,8 @@
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_minibatch(doc_stream, size):\n",
Expand All @@ -1397,29 +1430,18 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sebastian/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
" DeprecationWarning)\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import HashingVectorizer\n",
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"\n",
"vect = HashingVectorizer(decode_error='ignore', \n",
" n_features=2**21,\n",
" preprocessor=None, \n",
" tokenizer=tokenizer)\n",
"\n",
"clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
"doc_stream = stream_docs(path='movie_data.csv')"
" tokenizer=tokenizer)"
]
},
{
Expand All @@ -1428,20 +1450,39 @@
"source": [
"**Note**\n",
"\n",
"- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19. The `n_iter` parameter is used here deriberately, because some people still use scikit-learn 0.18.\n"
"- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19."
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from distutils.version import LooseVersion as Version\n",
"from sklearn import __version__ as sklearn_version\n",
"\n",
"\n",
"if Version(sklearn_version) < '0.18':\n",
" clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
"else:\n",
" clf = SGDClassifier(loss='log', random_state=1, max_iter=1)\n",
"\n",
"\n",
"doc_stream = stream_docs(path='movie_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0% [##############################] 100% | ETA: 00:00:00\n",
"Total time elapsed: 00:00:31\n"
"Total time elapsed: 00:00:28\n"
]
}
],
Expand All @@ -1461,7 +1502,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -1480,10 +1521,8 @@
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"clf = clf.partial_fit(X_test, y_test)"
Expand Down Expand Up @@ -1792,15 +1831,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook ch08.ipynb to script\n",
"[NbConvertApp] Writing 24613 bytes to ch08.py\n"
"[NbConvertApp] Writing 11500 bytes to ch08.txt\n"
]
}
],
Expand All @@ -1816,17 +1855,17 @@
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
Expand Down
12 changes: 11 additions & 1 deletion code/ch08/ch08.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import LatentDirichletAllocation
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# Added version check for recent scikit-learn 0.18 checks


# *Python Machine Learning 2nd Edition* by [Sebastian Raschka](https://sebastianraschka.com), Packt Publishing Ltd. 2017
#
Expand Down Expand Up @@ -577,7 +583,11 @@ def get_minibatch(doc_stream, size):
preprocessor=None,
tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
if Version(sklearn_version) < '0.19':
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

doc_stream = stream_docs(path='movie_data.csv')


Expand Down

0 comments on commit 5cbb7ca

Please sign in to comment.