diff --git a/ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl b/ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl
new file mode 100644
index 00000000..f0e898cd
Binary files /dev/null and b/ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl differ
diff --git a/ch09/movieclassifier_with_update/update.py b/ch09/movieclassifier_with_update/update.py
new file mode 100644
index 00000000..9864685f
--- /dev/null
+++ b/ch09/movieclassifier_with_update/update.py
@@ -0,0 +1,45 @@
+import pickle
+import sqlite3
+import numpy as np
+import os
+
+# import HashingVectorizer from local dir
+from vectorizer import vect
+
+
+def update_model(db_path, model, batch_size=10000):
+
+    conn = sqlite3.connect(db_path)
+    c = conn.cursor()
+    c.execute('SELECT * from review_db')
+
+    results = c.fetchmany(batch_size)
+    while results:
+        data = np.array(results)
+        X = data[:, 0]
+        y = data[:, 1].astype(int)
+
+        classes = np.array([0, 1])
+        X_train = vect.transform(X)
+        model.partial_fit(X_train, y, classes=classes)
+        results = c.fetchmany(batch_size)
+
+    conn.close()
+    return model
+
+cur_dir = os.path.dirname(__file__)
+
+clf = pickle.load(open(os.path.join(cur_dir,
+                       'pkl_objects',
+                       'classifier.pkl'), 'rb'))
+db = os.path.join(cur_dir, 'reviews.sqlite')
+
+clf = update_model(db_path=db, model=clf, batch_size=10000)
+
+# Uncomment the following lines if you are sure that
+# you want to update your classifier.pkl file
+# permanently.
+
+# pickle.dump(clf, open(os.path.join(cur_dir,
+#             'pkl_objects', 'classifier.pkl'), 'wb')
+#             , protocol=4)
diff --git a/ch09/movieclassifier_with_update/vectorizer.py b/ch09/movieclassifier_with_update/vectorizer.py
new file mode 100644
index 00000000..00d6e745
--- /dev/null
+++ b/ch09/movieclassifier_with_update/vectorizer.py
@@ -0,0 +1,24 @@
+from sklearn.feature_extraction.text import HashingVectorizer
+import re
+import os
+import pickle
+
+cur_dir = os.path.dirname(__file__)
+stop = pickle.load(open(
+    os.path.join(cur_dir,
+                 'pkl_objects',
+                 'stopwords.pkl'), 'rb'))
+
+def tokenizer(text):
+    text = re.sub('<[^>]*>', '', text)
+    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
+                           text.lower())
+    text = re.sub('[\W]+', ' ', text.lower()) \
+        + ' '.join(emoticons).replace('-', '')
+    tokenized = [w for w in text.split() if w not in stop]
+    return tokenized
+
+vect = HashingVectorizer(decode_error='ignore',
+                         n_features=2**21,
+                         preprocessor=None,
+                         tokenizer=tokenizer)
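
For context, the sketch below shows one way the updated objects could be exercised after running update.py from the movieclassifier_with_update directory. It is not part of the diff: it assumes classifier.pkl holds a scikit-learn SGDClassifier trained with a logistic loss (so predict_proba is available) and that the pkl_objects directory sits next to the script, as in the files added above.

# check_update.py -- minimal sketch, not included in the commit above
import os
import pickle
import numpy as np
from vectorizer import vect  # the HashingVectorizer defined in vectorizer.py

cur_dir = os.path.dirname(__file__)

# Load the (possibly just-updated) classifier the same way update.py does.
clf = pickle.load(open(os.path.join(cur_dir,
                       'pkl_objects', 'classifier.pkl'), 'rb'))

label = {0: 'negative', 1: 'positive'}
example = ["I loved this movie, the acting was fantastic"]

# Hash the raw text into the same 2**21-dimensional feature space.
X = vect.transform(example)

print('Prediction: %s' % label[clf.predict(X)[0]])
print('Probability: %.2f%%' % (np.max(clf.predict_proba(X)) * 100))

Because update_model() calls partial_fit without re-initializing the model, running the check before and after update.py is a quick way to confirm that the stored reviews actually shifted the decision function.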