
Commit

fix movie classifier update func
rasbt committed Nov 15, 2019
1 parent 495e90d commit b93da65
Showing 3 changed files with 69 additions and 0 deletions.
Binary file not shown.
45 changes: 45 additions & 0 deletions ch09/movieclassifier_with_update/update.py
@@ -0,0 +1,45 @@
import pickle
import sqlite3
import numpy as np
import os

# import HashingVectorizer from local dir
from vectorizer import vect


def update_model(db_path, model, batch_size=10000):

    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * from review_db')

    # stream the stored reviews in batches and refine the model
    # incrementally via partial_fit
    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        X = data[:, 0]
        y = data[:, 1].astype(int)

        classes = np.array([0, 1])
        X_train = vect.transform(X)
        model.partial_fit(X_train, y, classes=classes)
        results = c.fetchmany(batch_size)

    conn.close()
    return model


cur_dir = os.path.dirname(__file__)

clf = pickle.load(open(os.path.join(cur_dir,
                       'pkl_objects',
                       'classifier.pkl'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')

clf = update_model(db_path=db, model=clf, batch_size=10000)

# Uncomment the following lines if you are sure that
# you want to update your classifier.pkl file
# permanently.

# pickle.dump(clf, open(os.path.join(cur_dir,
#                       'pkl_objects', 'classifier.pkl'), 'wb'),
#             protocol=4)
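For context, `update_model` indexes the `SELECT *` result positionally, so it assumes a `review_db` table whose first column is the review text and whose second column is the 0/1 sentiment label. A minimal sketch of creating and populating such a database follows; the exact column names and any extra columns (e.g. a timestamp) are assumptions, not confirmed by this diff:

import sqlite3

# Hypothetical schema sketch: update.py only requires that column 0
# holds the review text and column 1 the integer sentiment label.
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('CREATE TABLE IF NOT EXISTS review_db'
          ' (review TEXT, sentiment INTEGER)')
c.execute('INSERT INTO review_db (review, sentiment) VALUES (?, ?)',
          ('I loved this movie, great acting and story', 1))
conn.commit()
conn.close()

Passing `classes=np.array([0, 1])` to `partial_fit` is required on incremental updates because the estimator must know the full label set up front, even if a given batch contains only one class.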
24 changes: 24 additions & 0 deletions ch09/movieclassifier_with_update/vectorizer.py
@@ -0,0 +1,24 @@
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir,
                        'pkl_objects',
                        'stopwords.pkl'), 'rb'))


def tokenizer(text):
    # strip HTML tags, keep emoticons, lowercase, and drop stop words
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
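As a quick usage check, the module-level `vect` can be applied directly to raw review strings, and its 2**21-dimensional hashed output matches what `update.py` feeds to `partial_fit`. The sample text below is made up for illustration, and the snippet assumes it is run from the movieclassifier_with_update directory so that `pkl_objects/stopwords.pkl` can be loaded:

from vectorizer import vect

# Transform a raw document into the hashed feature space; HashingVectorizer
# is stateless, so no fitting step is needed before transform.
example = ['This movie was surprisingly good :)']
X = vect.transform(example)
print(X.shape)  # (1, 2097152), a sparse row with 2**21 hashed features

Because the hashing trick is stateless, the web app and this update script can share the same vectorizer module without pickling a fitted vocabulary; only the classifier itself needs to be serialized.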
