# vectorizer.py (forked from rasbt/python-machine-learning-book-3rd-edition)
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

# Load the pickled stopword list that was serialized during training.
cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(
    os.path.join(cur_dir,
                 'pkl_objects',
                 'stopwords.pkl'), 'rb'))


def tokenizer(text):
    """Strip HTML, keep emoticons, and return stopword-filtered tokens."""
    # Remove HTML markup.
    text = re.sub('<[^>]*>', '', text)
    # Collect emoticons such as :), :-( or ;D before punctuation is removed.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    # Lowercase, drop non-word characters, and re-append the emoticons
    # (without the "nose" character '-').
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# HashingVectorizer needs no in-memory vocabulary, so it suits out-of-core
# learning; 2**21 hash buckets keep the collision probability low.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
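

# A minimal usage sketch, not part of the original module: it shows what the
# tokenizer and the hashing vectorizer produce for a made-up review string.
# The example text is an illustrative assumption, and the commented output
# assumes the pickled stopword list is NLTK's English stopword set.
if __name__ == '__main__':
    example = '</a>This :) is :( a test :-)!'

    # HTML is stripped, stopwords removed, emoticons kept (noses dropped).
    print(tokenizer(example))
    # -> ['test', ':)', ':(', ':)']

    # transform() takes an iterable of documents and returns a SciPy sparse
    # matrix with one row per document and 2**21 hashed feature columns.
    X = vect.transform([example])
    print(X.shape)  # (1, 2097152)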