
Commit

fix movie classifier update func
rasbt committed Nov 15, 2019
1 parent 495e90d commit b93da65
Showing 3 changed files with 69 additions and 0 deletions.
Binary file not shown.
45 changes: 45 additions & 0 deletions ch09/movieclassifier_with_update/update.py
@@ -0,0 +1,45 @@
import pickle
import sqlite3
import numpy as np
import os

# import HashingVectorizer from local dir
from vectorizer import vect


def update_model(db_path, model, batch_size=10000):

    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * from review_db')

    # stream the stored reviews in batches and refine the model
    # incrementally via partial_fit
    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        X = data[:, 0]
        y = data[:, 1].astype(int)

        classes = np.array([0, 1])
        X_train = vect.transform(X)
        model.partial_fit(X_train, y, classes=classes)
        results = c.fetchmany(batch_size)

    conn.close()
    return model


cur_dir = os.path.dirname(__file__)

clf = pickle.load(open(os.path.join(cur_dir,
                       'pkl_objects',
                       'classifier.pkl'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')

clf = update_model(db_path=db, model=clf, batch_size=10000)

# Uncomment the following lines if you are sure that
# you want to update your classifier.pkl file
# permanently.

# pickle.dump(clf, open(os.path.join(cur_dir,
#                       'pkl_objects', 'classifier.pkl'), 'wb'),
#             protocol=4)
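For context, `update_model` indexes the `SELECT *` result positionally, so it assumes a `review_db` table whose first column is the review text and whose second column is the 0/1 sentiment label. A minimal sketch of creating and populating such a database follows; the exact column names and any extra columns (e.g. a timestamp) are assumptions, not confirmed by this diff:

import sqlite3

# Hypothetical schema sketch: update.py only requires that column 0
# holds the review text and column 1 the integer sentiment label.
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('CREATE TABLE IF NOT EXISTS review_db'
          ' (review TEXT, sentiment INTEGER)')
c.execute('INSERT INTO review_db (review, sentiment) VALUES (?, ?)',
          ('I loved this movie, great acting and story', 1))
conn.commit()
conn.close()

Passing `classes=np.array([0, 1])` to `partial_fit` is required on incremental updates because the estimator must know the full label set up front, even if a given batch contains only one class.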
24 changes: 24 additions & 0 deletions ch09/movieclassifier_with_update/vectorizer.py
@@ -0,0 +1,24 @@
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir,
                        'pkl_objects',
                        'stopwords.pkl'), 'rb'))


def tokenizer(text):
    # strip HTML tags, keep emoticons, lowercase, and drop stop words
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
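As a quick usage check, the module-level `vect` can be applied directly to raw review strings, and its 2**21-dimensional hashed output matches what `update.py` feeds to `partial_fit`. The sample text below is made up for illustration, and the snippet assumes it is run from the movieclassifier_with_update directory so that `pkl_objects/stopwords.pkl` can be loaded:

from vectorizer import vect

# Transform a raw document into the hashed feature space; HashingVectorizer
# is stateless, so no fitting step is needed before transform.
example = ['This movie was surprisingly good :)']
X = vect.transform(example)
print(X.shape)  # (1, 2097152), a sparse row with 2**21 hashed features

Because the hashing trick is stateless, the web app and this update script can share the same vectorizer module without pickling a fitted vocabulary; only the classifier itself needs to be serialized.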
