
Commit

Merge pull request bfelbo#18 from rht/pep8
Lint the code according to PEP8
Bjarke Felbo authored Dec 21, 2017
2 parents 2700754 + 1c8f5b3 commit abeb89a
Showing 27 changed files with 176 additions and 132 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -14,9 +14,10 @@ matrix:
- virtualenv env
- source env/bin/activate
install:
- pip install nose tensorflow==1.4.0
- pip install flake8 nose tensorflow==1.4.0
- pip install -e .
script:
- flake8 deepmoji examples scripts tests --ignore=F403,E501,E123,E128,E402,F401,F841,F811
- python scripts/download_weights.py
- cd tests
- nosetests -v
1 change: 1 addition & 0 deletions deepmoji/attlayer.py
@@ -8,6 +8,7 @@
from keras.engine import InputSpec, Layer
from keras import backend as K


class AttentionWeightedAverage(Layer):
"""
Computes a weighted average of the different channels across timesteps.
18 changes: 10 additions & 8 deletions deepmoji/class_avg_finetuning.py
@@ -20,6 +20,7 @@
train_by_chain_thaw,
find_f1_threshold)


def relabel(y, current_label_nr, nb_classes):
""" Makes a binary classification for a specific class in a
multi-class dataset.
@@ -39,7 +40,7 @@ def relabel(y, current_label_nr, nb_classes):
return y

y_new = np.zeros(len(y))
y_cut = y[:,current_label_nr]
y_cut = y[:, current_label_nr]
label_pos = np.where(y_cut == 1)[0]
y_new[label_pos] = 1
return y_new
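
(For context, a minimal sketch with toy data, not part of this diff: the relabel() lines above pick one column of a one-hot label matrix and turn it into a binary target vector.)

    import numpy as np

    # toy one-hot labels for 3 classes (rows are samples); values are illustrative only
    y = np.array([[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1],
                  [0, 1, 0]])

    current_label_nr = 1                    # class to relabel against
    y_cut = y[:, current_label_nr]          # the slice reformatted in this hunk
    y_new = np.zeros(len(y))
    y_new[np.where(y_cut == 1)[0]] = 1
    print(y_new)                            # [0. 1. 0. 1.]
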
@@ -155,13 +156,14 @@ def prepare_labels(y_train, y_val, y_test, iter_i, nb_classes):
y_test_new = relabel(y_test, iter_i, nb_classes)
return y_train_new, y_val_new, y_test_new


def prepare_generators(X_train, y_train_new, X_val, y_val_new, batch_size, epoch_size):
# Create sample generators
# Make a fixed validation set to avoid fluctuations in validation
train_gen = sampling_generator(X_train, y_train_new, batch_size,
upsample=False)
upsample=False)
val_gen = sampling_generator(X_val, y_val_new,
epoch_size, upsample=False)
epoch_size, upsample=False)
X_val_resamp, y_val_resamp = next(val_gen)
return train_gen, X_val_resamp, y_val_resamp

@@ -203,7 +205,7 @@ def class_avg_tune_trainable(model, nb_classes, train, val, test, epoch_size,
model.save_weights(init_weight_path)
for i in range(nb_iter):
if verbose:
print('Iteration number {}/{}'.format(i+1, nb_iter))
print('Iteration number {}/{}'.format(i + 1, nb_iter))

model.load_weights(init_weight_path, by_name=False)
y_train_new, y_val_new, y_test_new = prepare_labels(y_train, y_val,
@@ -215,7 +217,7 @@ def class_avg_tune_trainable(model, nb_classes, train, val, test, epoch_size,
if verbose:
print("Training..")
callbacks = finetuning_callbacks(checkpoint_weight_path, patience)
steps = int(epoch_size/batch_size)
steps = int(epoch_size / batch_size)
model.fit_generator(train_gen, steps_per_epoch=steps,
max_q_size=2, epochs=nb_epochs,
validation_data=(X_val_resamp, y_val_resamp),
@@ -287,14 +289,14 @@ def class_avg_chainthaw(model, nb_classes, train, val, test, batch_size,

for i in range(nb_iter):
if verbose:
print('Iteration number {}/{}'.format(i+1, nb_iter))
print('Iteration number {}/{}'.format(i + 1, nb_iter))

model.load_weights(f1_init_weight_path, by_name=False)
y_train_new, y_val_new, y_test_new = prepare_labels(y_train, y_val,
y_test, i, nb_classes)
train_gen, X_val_resamp, y_val_resamp = \
prepare_generators(X_train, y_train_new, X_val, y_val_new,
batch_size, epoch_size)
prepare_generators(X_train, y_train_new, X_val, y_val_new,
batch_size, epoch_size)

if verbose:
print("Training..")
22 changes: 13 additions & 9 deletions deepmoji/create_vocab.py
@@ -10,14 +10,16 @@
from global_variables import SPECIAL_TOKENS, VOCAB_PATH
from copy import deepcopy


class VocabBuilder():
""" Create vocabulary with words extracted from sentences as fed from a
word generator.
"""

def __init__(self, word_gen):
# initialize any new key with value of 0
self.word_counts = defaultdict(lambda: 0, {})
self.word_length_limit=30
self.word_length_limit = 30

for token in SPECIAL_TOKENS:
assert len(token) < self.word_length_limit
@@ -44,7 +46,7 @@ def save_vocab(self, path=None):
path: Where the vocabulary should be saved. If not specified, a
randomly generated filename is used instead.
"""
dtype = ([('word','|S{}'.format(self.word_length_limit)),('count','int')])
dtype = ([('word', '|S{}'.format(self.word_length_limit)), ('count', 'int')])
np_dict = np.array(self.word_counts.items(), dtype=dtype)

# sort from highest to lowest frequency
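
(For reference, a minimal sketch with assumed toy values, not from the repository: the structured dtype built above stores (word, count) pairs so they can be sorted by frequency, as the comment notes. word_length_limit is taken as 30 here, matching __init__ above.)

    import numpy as np

    dtype = [('word', '|S30'), ('count', 'int')]
    np_dict = np.array([(b'happy', 12), (b'sad', 7), (b'lol', 31)], dtype=dtype)

    np_dict.sort(order='count')             # ascending sort on the count field
    np_dict = np_dict[::-1]                 # highest to lowest frequency
    print(np_dict['word'][0])               # b'lol', the most frequent word
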
@@ -72,9 +74,11 @@ def count_all_words(self):
for words, _ in self.word_gen:
self.count_words_in_sentence(words)


class MasterVocab():
""" Combines vocabularies.
"""

def __init__(self):

# initialize custom tokens
@@ -144,8 +148,8 @@ def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None):
force_word_count = force_appearance_vocab[word]
except KeyError:
continue
#if force_word_count < 5:
#continue
# if force_word_count < 5:
# continue

if word in self.master_vocab:
self.master_vocab[word] += normalized_count
@@ -168,13 +172,13 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):

# sort words by frequency
desc_order = OrderedDict(sorted(self.master_vocab.items(),
key=lambda kv: kv[1], reverse=True))
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

# use encoding of up to 30 characters (no token conversions)
# use float to store large numbers (we don't care about precision loss)
np_vocab = np.array(words.items(),
dtype=([('word','|S30'),('count','float')]))
dtype=([('word', '|S30'), ('count', 'float')]))

# output count for debugging
counts = np_vocab[:word_limit]
@@ -183,7 +187,7 @@ def save_vocab(self, path_count, path_vocab, word_limit=100000):
# output the index of each word for easy lookup
final_words = OrderedDict()
for i, w in enumerate(words.keys()[:word_limit]):
final_words.update({w:i})
final_words.update({w: i})
with open(path_vocab, 'w') as f:
f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))

@@ -231,7 +235,7 @@ def extend_vocab_in_file(vocab, max_tokens=10000, vocab_path=VOCAB_PATH):

# Save back to file
with open(vocab_path, 'w') as f:
json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',',': '))
json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',', ': '))


def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
@@ -254,7 +258,7 @@ def extend_vocab(current_vocab, new_vocab, max_tokens=10000):

# sort words by frequency
desc_order = OrderedDict(sorted(new_vocab.word_counts.items(),
key=lambda kv: kv[1], reverse=True))
key=lambda kv: kv[1], reverse=True))
words.update(desc_order)

base_index = len(current_vocab.keys())
3 changes: 3 additions & 0 deletions deepmoji/filter_input.py
@@ -4,6 +4,7 @@
import numpy as np
from emoji import UNICODE_EMOJI


def read_english(path="english_words.txt", add_emojis=True):
# read english words for filtering (includes emojis as part of set)
english = set()
@@ -17,6 +18,7 @@ def read_english(path="english_words.txt", add_emojis=True):
english.add(e)
return english


def read_wanted_emojis(path="wanted_emojis.csv"):
emojis = []
with open(path, 'rb') as f:
@@ -27,6 +29,7 @@ def read_wanted_emojis(path="wanted_emojis.csv"):
emojis.append(line)
return emojis


def read_non_english_users(path="unwanted_users.npz"):
try:
neu_set = set(np.load(path)['userids'])
51 changes: 33 additions & 18 deletions deepmoji/filter_utils.py
@@ -13,28 +13,29 @@
urlRegex = re.compile(RE_URL)

# from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars)
VARIATION_SELECTORS = [ u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']
VARIATION_SELECTORS = [u'\ufe00',
u'\ufe01',
u'\ufe02',
u'\ufe03',
u'\ufe04',
u'\ufe05',
u'\ufe06',
u'\ufe07',
u'\ufe08',
u'\ufe09',
u'\ufe0a',
u'\ufe0b',
u'\ufe0c',
u'\ufe0d',
u'\ufe0e',
u'\ufe0f']

# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
ALL_CHARS = (unichr(i) for i in xrange(sys.maxunicode))
CONTROL_CHARS = ''.join(map(unichr, range(0,32) + range(127,160)))
CONTROL_CHARS = ''.join(map(unichr, range(0, 32) + range(127, 160)))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))


def is_special_token(word):
equal = False
for spec in SPECIAL_TOKENS:
@@ -43,6 +44,7 @@ def is_special_token(word):
break
return equal


def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
""" Ensure text meets threshold for containing English words """

@@ -71,6 +73,7 @@ def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_s
valid_english = n_english >= n_words * pct_eng_long
return valid_english, n_words, n_english


def correct_length(words, min_words, max_words, ignore_special_tokens=True):
""" Ensure text meets threshold for containing English words
and that it's within the min and max words limits. """
@@ -91,17 +94,21 @@ def correct_length(words, min_words, max_words, ignore_special_tokens=True):
valid = min_words <= n_words and n_words <= max_words
return valid


def punct_word(word, punctuation=string.punctuation):
return all([True if c in punctuation else False for c in word])


def load_non_english_user_set():
non_english_user_set = set(np.load('uids.npz')['data'])
return non_english_user_set


def non_english_user(userid, non_english_user_set):
neu_found = int(userid) in non_english_user_set
return neu_found


def separate_emojis_and_text(text):
emoji_chars = []
non_emoji_chars = []
@@ -112,10 +119,12 @@ def separate_emojis_and_text(text):
non_emoji_chars.append(c)
return ''.join(emoji_chars), ''.join(non_emoji_chars)


def extract_emojis(text, wanted_emojis):
text = remove_variation_selectors(text)
return [c for c in text if c in wanted_emojis]


def remove_variation_selectors(text):
""" Remove styling glyph variants for Unicode characters.
For instance, remove skin color from emojis.
@@ -124,6 +133,7 @@ def remove_variation_selectors(text):
text = text.replace(var, u'')
return text


def shorten_word(word):
""" Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
"""
@@ -147,10 +157,11 @@ def shorten_word(word):
# replace letters to find the short word
short_word = word
for trip in triple_or_more:
short_word = short_word.replace(trip, trip[0]*2)
short_word = short_word.replace(trip, trip[0] * 2)

return short_word
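
(A quick illustration with a hypothetical input, assuming the module is importable as deepmoji.filter_utils per the file path: the docstring's rule collapses runs of 3+ identical characters down to 2.)

    from deepmoji.filter_utils import shorten_word

    print(shorten_word(u'woooow!!!!'))      # expected: u'woow!!'
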


def detect_special_tokens(word):
try:
int(word)
@@ -162,22 +173,26 @@ def detect_special_tokens(word):
word = SPECIAL_TOKENS[3]
return word


def process_word(word):
""" Shortening and converting the word to a special token if relevant.
"""
word = shorten_word(word)
word = detect_special_tokens(word)
return word


def remove_control_chars(text):
return CONTROL_CHAR_REGEX.sub('', text)


def convert_nonbreaking_space(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
for r in [u'\\\\xc2', u'\\xc2', u'\xc2', u'\\\\xa0', u'\\xa0', u'\xa0']:
text = text.replace(r, u' ')
return text


def convert_linebreaks(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
# space around to ensure proper tokenization
(The remaining changed files in this commit are not shown here.)
