
Commit

Merge pull request bfelbo#18 from rht/pep8
Lint the code according to PEP8
Bjarke Felbo authored Dec 21, 2017
2 parents 2700754 + 1c8f5b3 commit abeb89a
Showing 27 changed files with 176 additions and 132 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -14,9 +14,10 @@ matrix:
- virtualenv env
- source env/bin/activate
install:
- pip install nose tensorflow==1.4.0
- pip install flake8 nose tensorflow==1.4.0
- pip install -e .
script:
- flake8 deepmoji examples scripts tests --ignore=F403,E501,E123,E128,E402,F401,F841,F811
- python scripts/download_weights.py
- cd tests
- nosetests -v
1 change: 1 addition & 0 deletions deepmoji/attlayer.py
@@ -8,6 +8,7 @@
from keras.engine import InputSpec, Layer
from keras import backend as K


class AttentionWeightedAverage(Layer):
"""
Computes a weighted average of the different channels across timesteps.
18 changes: 10 additions & 8 deletions deepmoji/class_avg_finetuning.py
@@ -20,6 +20,7 @@
train_by_chain_thaw,
find_f1_threshold)


def relabel(y, current_label_nr, nb_classes):
""" Makes a binary classification for a specific class in a
multi-class dataset.
@@ -39,7 +40,7 @@ def relabel(y, current_label_nr, nb_classes):
return y

y_new = np.zeros(len(y))
y_cut = y[:,current_label_nr]
y_cut = y[:, current_label_nr]
label_pos = np.where(y_cut == 1)[0]
y_new[label_pos] = 1
return y_new
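
(For context, a minimal sketch with toy data, not part of this diff: the relabel() lines above pick one column of a one-hot label matrix and turn it into a binary target vector.)

    import numpy as np

    # toy one-hot labels for 3 classes (rows are samples); values are illustrative only
    y = np.array([[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1],
                  [0, 1, 0]])

    current_label_nr = 1                    # class to relabel against
    y_cut = y[:, current_label_nr]          # the slice reformatted in this hunk
    y_new = np.zeros(len(y))
    y_new[np.where(y_cut == 1)[0]] = 1
    print(y_new)                            # [0. 1. 0. 1.]
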
@@ -155,13 +156,14 @@ def prepare_labels(y_train, y_val, y_test, iter_i, nb_classes):
y_test_new = relabel(y_test, iter_i, nb_classes)
return y_train_new, y_val_new, y_test_new


def prepare_generators(X_train, y_train_new, X_val, y_val_new, batch_size, epoch_size):
# Create sample generators
# Make a fixed validation set to avoid fluctuations in validation
train_gen = sampling_generator(X_train, y_train_new, batch_size,
upsample=False)
upsample=False)
val_gen = sampling_generator(X_val, y_val_new,
epoch_size, upsample=False)
epoch_size, upsample=False)
X_val_resamp, y_val_resamp = next(val_gen)
return train_gen, X_val_resamp, y_val_resamp

@@ -203,7 +205,7 @@ def class_avg_tune_trainable(model, nb_classes, train, val, test, epoch_size,
model.save_weights(init_weight_path)
for i in range(nb_iter):
if verbose:
print('Iteration number {}/{}'.format(i+1, nb_iter))
print('Iteration number {}/{}'.format(i + 1, nb_iter))

model.load_weights(init_weight_path, by_name=False)
y_train_new, y_val_new, y_test_new = prepare_labels(y_train, y_val,
@@ -215,7 +217,7 @@ def class_avg_tune_trainable(model, nb_classes, train, val, test, epoch_size,
if verbose:
print("Training..")
callbacks = finetuning_callbacks(checkpoint_weight_path, patience)
steps = int(epoch_size/batch_size)
steps = int(epoch_size / batch_size)
model.fit_generator(train_gen, steps_per_epoch=steps,
max_q_size=2, epochs=nb_epochs,
validation_data=(X_val_resamp, y_val_resamp),
@@ -287,14 +289,14 @@ def class_avg_chainthaw(model, nb_classes, train, val, test, batch_size,

for i in range(nb_iter):
if verbose:
print('Iteration number {}/{}'.format(i+1, nb_iter))
print('Iteration number {}/{}'.format(i + 1, nb_iter))

model.load_weights(f1_init_weight_path, by_name=False)
y_train_new, y_val_new, y_test_new = prepare_labels(y_train, y_val,
y_test, i, nb_classes)
train_gen, X_val_resamp, y_val_resamp = \
prepare_generators(X_train, y_train_new, X_val, y_val_new,
batch_size, epoch_size)
prepare_generators(X_train, y_train_new, X_val, y_val_new,
batch_size, epoch_size)

if verbose:
print("Training..")
22 changes: 13 additions & 9 deletions deepmoji/create_vocab.py
@@ -10,14 +10,16 @@
from global_variables import SPECIAL_TOKENS, VOCAB_PATH
from copy import deepcopy


class VocabBuilder():
""" Create vocabulary with words extracted from sentences as fed from a
word generator.
"""

def __init__(self, word_gen):
# initialize any new key with value of 0
self.word_counts = defaultdict(lambda: 0, {})
self.word_length_limit=30
self.word_length_limit = 30

for token in SPECIAL_TOKENS:
assert len(token) < self.word_length_limit
@@ -44,7 +46,7 @@ def save_vocab(self, path=None):
path: Where the vocabulary should be saved. If not specified, a
randomly generated filename is used instead.
"""
dtype = ([('word','|S{}'.format(self.word_length_limit)),('count','int')])
dtype = ([('word', '|S{}'.format(self.word_length_limit)), ('count', 'int')])
np_dict = np.array(self.word_counts.items(), dtype=dtype)

# sort from highest to lowest frequency
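
(For reference, a minimal sketch with assumed toy values, not from the repository: the structured dtype built above stores (word, count) pairs so they can be sorted by frequency, as the comment notes. word_length_limit is taken as 30 here, matching __init__ above.)

    import numpy as np

    dtype = [('word', '|S30'), ('count', 'int')]
    np_dict = np.array([(b'happy', 12), (b'sad', 7), (b'lol', 31)], dtype=dtype)

    np_dict.sort(order='count')             # ascending sort on the count field
    np_dict = np_dict[::-1]                 # highest to lowest frequency
    print(np_dict['word'][0])               # b'lol', the most frequent word
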
@@ -72,9 +74,11 @@ def count_all_words(self):
for words, _ in self.word_gen:
self.count_words_in_sentence(words)


class MasterVocab():
""" Combines vocabularies.
"""

def __init__(self):

# initialize custom tokens
@@ -144,8 +148,8 @@ def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None):
force_word_count = force_appearance_vocab[word]
except KeyError:
continue
#if force_word_count < 5:
#continue
# if force_word_count < 5:
# continue

if word in self.master_vocab:
self.master_vocab[word] += normalized_count
@@ -168,13 +172,13 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):

# sort words by frequency
desc_order = OrderedDict(sorted(self.master_vocab.items(),
key=lambda kv: kv[1], reverse=True))
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

# use encoding of up to 30 characters (no token conversions)
# use float to store large numbers (we don't care about precision loss)
np_vocab = np.array(words.items(),
dtype=([('word','|S30'),('count','float')]))
dtype=([('word', '|S30'), ('count', 'float')]))

# output count for debugging
counts = np_vocab[:word_limit]
@@ -183,7 +187,7 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):
# output the index of each word for easy lookup
final_words = OrderedDict()
for i, w in enumerate(words.keys()[:word_limit]):
final_words.update({w:i})
final_words.update({w: i})
with open(path_vocab, 'w') as f:
f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))

@@ -231,7 +235,7 @@ def extend_vocab_in_file(vocab, max_tokens=10000, vocab_path=VOCAB_PATH):

# Save back to file
with open(vocab_path, 'w') as f:
json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',',': '))
json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',', ': '))


def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
@@ -254,7 +258,7 @@ def extend_vocab(current_vocab, new_vocab, max_tokens=10000):

# sort words by frequency
desc_order = OrderedDict(sorted(new_vocab.word_counts.items(),
key=lambda kv: kv[1], reverse=True))
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

base_index = len(current_vocab.keys())
3 changes: 3 additions & 0 deletions deepmoji/filter_input.py
@@ -4,6 +4,7 @@
import numpy as np
from emoji import UNICODE_EMOJI


def read_english(path="english_words.txt", add_emojis=True):
# read english words for filtering (includes emojis as part of set)
english = set()
@@ -17,6 +18,7 @@ def read_english(path="english_words.txt", add_emojis=True):
english.add(e)
return english


def read_wanted_emojis(path="wanted_emojis.csv"):
emojis = []
with open(path, 'rb') as f:
@@ -27,6 +29,7 @@ def read_wanted_emojis(path="wanted_emojis.csv"):
emojis.append(line)
return emojis


def read_non_english_users(path="unwanted_users.npz"):
try:
neu_set = set(np.load(path)['userids'])
51 changes: 33 additions & 18 deletions deepmoji/filter_utils.py
@@ -13,28 +13,29 @@
urlRegex = re.compile(RE_URL)

# from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars)
VARIATION_SELECTORS = [ u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']
VARIATION_SELECTORS = [u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']

# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
ALL_CHARS = (unichr(i) for i in xrange(sys.maxunicode))
CONTROL_CHARS = ''.join(map(unichr, range(0,32) + range(127,160)))
CONTROL_CHARS = ''.join(map(unichr, range(0, 32) + range(127, 160)))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))


def is_special_token(word):
equal = False
for spec in SPECIAL_TOKENS:
@@ -43,6 +44,7 @@ def is_special_token(word):
break
return equal


def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
""" Ensure text meets threshold for containing English words """

@@ -71,6 +73,7 @@ def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_s
valid_english = n_english >= n_words * pct_eng_long
return valid_english, n_words, n_english


def correct_length(words, min_words, max_words, ignore_special_tokens=True):
""" Ensure text meets threshold for containing English words
and that it's within the min and max words limits. """
@@ -91,17 +94,21 @@ def correct_length(words, min_words, max_words, ignore_special_tokens=True):
valid = min_words <= n_words and n_words <= max_words
return valid


def punct_word(word, punctuation=string.punctuation):
return all([True if c in punctuation else False for c in word])


def load_non_english_user_set():
non_english_user_set = set(np.load('uids.npz')['data'])
return non_english_user_set


def non_english_user(userid, non_english_user_set):
neu_found = int(userid) in non_english_user_set
return neu_found


def separate_emojis_and_text(text):
emoji_chars = []
non_emoji_chars = []
@@ -112,10 +119,12 @@ def separate_emojis_and_text(text):
non_emoji_chars.append(c)
return ''.join(emoji_chars), ''.join(non_emoji_chars)


def extract_emojis(text, wanted_emojis):
text = remove_variation_selectors(text)
return [c for c in text if c in wanted_emojis]


def remove_variation_selectors(text):
""" Remove styling glyph variants for Unicode characters.
For instance, remove skin color from emojis.
@@ -124,6 +133,7 @@ def remove_variation_selectors(text):
text = text.replace(var, u'')
return text


def shorten_word(word):
""" Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
"""
@@ -147,10 +157,11 @@ def shorten_word(word):
# replace letters to find the short word
short_word = word
for trip in triple_or_more:
short_word = short_word.replace(trip, trip[0]*2)
short_word = short_word.replace(trip, trip[0] * 2)

return short_word
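
(A quick illustration with a hypothetical input, assuming the module is importable as deepmoji.filter_utils per the file path: the docstring's rule collapses runs of 3+ identical characters down to 2.)

    from deepmoji.filter_utils import shorten_word

    print(shorten_word(u'woooow!!!!'))      # expected: u'woow!!'
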


def detect_special_tokens(word):
try:
int(word)
@@ -162,22 +173,26 @@ def detect_special_tokens(word):
word = SPECIAL_TOKENS[3]
return word


def process_word(word):
""" Shortening and converting the word to a special token if relevant.
"""
word = shorten_word(word)
word = detect_special_tokens(word)
return word


def remove_control_chars(text):
return CONTROL_CHAR_REGEX.sub('', text)


def convert_nonbreaking_space(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
for r in [u'\\\\xc2', u'\\xc2', u'\xc2', u'\\\\xa0', u'\\xa0', u'\xa0']:
text = text.replace(r, u' ')
return text


def convert_linebreaks(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
# space around to ensure proper tokenization
(The remaining changed files in this commit are not shown here.)
