From b93da650b55a70b385c3307172ed3189c858b193 Mon Sep 17 00:00:00 2001 From: rasbt Date: Fri, 15 Nov 2019 14:52:44 -0600 Subject: [PATCH] fix movie classifier update func --- .../pkl_objects/stopwords.pkl | Bin 0 -> 1310 bytes ch09/movieclassifier_with_update/update.py | 45 ++++++++++++++++++ .../movieclassifier_with_update/vectorizer.py | 24 ++++++++++ 3 files changed, 69 insertions(+) create mode 100644 ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl create mode 100644 ch09/movieclassifier_with_update/update.py create mode 100644 ch09/movieclassifier_with_update/vectorizer.py diff --git a/ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl b/ch09/movieclassifier_with_update/pkl_objects/stopwords.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f0e898cd5244a1f95050f4a63b13a7b00d5a7246 GIT binary patch literal 1310 zcmXw&O_H22426SjU}m;CL}!&7WnmiAI0ZP>z)-Wx0eacFZ@wpHSV)%i^z&r%H~jlv zf3xq`^T(sg&xf0n#K-f|%pYr-2CM^@FZ=!Z2*~yMc$G@id*bXr$_^NLxr4LA``2Nb zo{w6go?J8~WU9uCX>0KId9-?NRJSqDCIPmb&q4gESdF}>@t8Cb(m<$l+PvY-n?dqM ztU+2RY-56$6xC}~rb1&H+?)Jgw8Ak)3f0G$uOrQlv5<)*lL8q$N|09SVoWKpYQk+S z(k=pag8Hc4_#-OqNHSc92wh4G30=y|4RefIogalmvb6@6#ayoe`eHlpGoiXK7OA3I zQO46B85e@S+nLQ}-Cu!a+ofyVS0@5`2TIq;h_O%YIZT*r$1pqy~z4eoX9ii)-I z7W1;|)TQmP$wOW?=3)o;vfltV{eHKpnl4d<(sdj*-IU6R9b~6+&`614xND0MxwT4CE2xYm_M9)IK6q9*a5VmG=n<6=C3?FrM zEKM(?Dsa=Q+kHI`7ke#>XjFKe5-ZkOQQX-h z6QI&0wynT3q+BCq^{y(1=jA-e7kFF3NXe#mT5+_d4Z*_{Lh92N0DC<%^oo^XS_q#7 zsFAYV%!6&_d>c2sFXusldWN$SO`B&)NxfRr@`TkE$R?r{YtK!9`P|7l5`P-5M#}2w XpHm&v^7z9Rio_|nh+;r|I@ literal 0 HcmV?d00001 diff --git a/ch09/movieclassifier_with_update/update.py b/ch09/movieclassifier_with_update/update.py new file mode 100644 index 00000000..9864685f --- /dev/null +++ b/ch09/movieclassifier_with_update/update.py @@ -0,0 +1,45 @@ +import pickle +import sqlite3 +import numpy as np +import os + +# import HashingVectorizer from local dir +from vectorizer import vect + + +def update_model(db_path, model, batch_size=10000): + + conn = sqlite3.connect(db_path) + c = conn.cursor() + c.execute('SELECT * from review_db') + + results = c.fetchmany(batch_size) + while results: + data = np.array(results) + X = data[:, 0] + y = data[:, 1].astype(int) + + classes = np.array([0, 1]) + X_train = vect.transform(X) + model.partial_fit(X_train, y, classes=classes) + results = c.fetchmany(batch_size) + + conn.close() + return model + +cur_dir = os.path.dirname(__file__) + +clf = pickle.load(open(os.path.join(cur_dir, + 'pkl_objects', + 'classifier.pkl'), 'rb')) +db = os.path.join(cur_dir, 'reviews.sqlite') + +clf = update_model(db_path=db, model=clf, batch_size=10000) + +# Uncomment the following lines if you are sure that +# you want to update your classifier.pkl file +# permanently. + +# pickle.dump(clf, open(os.path.join(cur_dir, +# 'pkl_objects', 'classifier.pkl'), 'wb') +# , protocol=4) diff --git a/ch09/movieclassifier_with_update/vectorizer.py b/ch09/movieclassifier_with_update/vectorizer.py new file mode 100644 index 00000000..00d6e745 --- /dev/null +++ b/ch09/movieclassifier_with_update/vectorizer.py @@ -0,0 +1,24 @@ +from sklearn.feature_extraction.text import HashingVectorizer +import re +import os +import pickle + +cur_dir = os.path.dirname(__file__) +stop = pickle.load(open( + os.path.join(cur_dir, + 'pkl_objects', + 'stopwords.pkl'), 'rb')) + +def tokenizer(text): + text = re.sub('<[^>]*>', '', text) + emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', + text.lower()) + text = re.sub('[\W]+', ' ', text.lower()) \ + + ' '.join(emoticons).replace('-', '') + tokenized = [w for w in text.split() if w not in stop] + return tokenized + +vect = HashingVectorizer(decode_error='ignore', + n_features=2**21, + preprocessor=None, + tokenizer=tokenizer)