# coding: utf-8 # In[1]: import numpy, sys, os, pandas as pd from random import randint from pickle import dump, load # In[2]: # load doc into memory def load_doc(filename): # open the file as read only file = open(filename, 'r') # read all text text = file.read() # close the file file.close() tokens = text.split() print(tokens[:100]) print('Total Tokens: %d' % len(tokens)) print('Unique Tokens: %d' % len(set(tokens))) return tokens # In[3]: # organize into sequences of tokens #the plus one is because the last val in the list will be the expected prediction. #Its our Y-train def sequencesCreate(length, tokens): from keras.preprocessing.text import Tokenizer sequences = list() for i in range(length, len(tokens)): # select sequence of tokens seq = tokens[i-length:i] # convert into a line #line = ' '.join(seq) # store sequences.append(seq) print('Total Sequences: %d' % len(sequences)) print(f'sequences[0][0]: {sequences[0][0]}') tokenizer = Tokenizer() # integer encode sequences of words #sequences = [str(i) for i in sequences] # print(f'tokenizer: {tokenizer}') tokenizer.fit_on_texts(sequences) # print(f'tokenizer: {tokenizer}') sequences = tokenizer.texts_to_sequences(sequences) # print(f'sequences: {sequences}') return sequences, tokenizer # In[4]: def modelFit(model, modelName, X, y, seq_length, batch_size, epochs, results_path): from keras.callbacks import ModelCheckpoint # compile model model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # define the checkpoint filepath=f"{results_path.rstrip('/').lstrip('/')}/wi_{{epoch:02d}}_{{loss:.4f}}_{modelName}.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') callbacks_list = [checkpoint] # fit model history_callback = model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list) return history_callback # In[12]: #--- --- ---- --- ---- --- ---- ---- --- ----- ---- --- # -- Write Files ---- ---- ---- --- ---- --- --- --- -- #--- --- ---- --- ---- --- ---- ---- --- ----- ---- --- def writeFiles(modelName, modelList, seq_length, total_sequences, epochs, batch_size, results_path): model_info = {} #history_callback.history model_info['seq_length'] = seq_length model_info['total_sequences'] = total_sequences model_info['batch_size'] = batch_size model_info['epochs'] = epochs # save losses rFile = results_path.rstrip('/').lstrip('/') + '/info_' + modelName + '.txt' print(f'Info File: {rFile}') with open(rFile,'w+') as f: f.write(str(modelList)) f.write('\n') f.write(str(model_info)) # In[6]: # define model def defineModel(vocab_size, seq_length, modelList, length, input_shape): from keras.models import Sequential from keras.layers import Dense from keras.layers import Dropout from keras.layers import LSTM from keras.utils import np_utils from keras.layers import Embedding, Flatten model = Sequential() #-- EMBEDDED LAYER --- --- --- ---- -- #input_dim: size of the vocabulary in the text data. #output_dim: size of the vector space where words will be embedded. or size of the output vectors from this layer try 32 or 100 or larger #input_length: length of input seq's. ex: if input documents are comprised of 1000 words, it would be 1000. # modelList = [{'model':'Embedding', 'input_dim':vocab_size, 'output_dim': 100, 'input_length': seq_length}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': .2}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': .2}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model':'Flatten'}, # {'model': 'Dense','units':vocab_size,'activation':'softmax'}, # ] for i,layer in enumerate(modelList): if layer['model'] == 'Embedding': model.add(Embedding(input_dim=layer['input_dim'], output_dim=layer['output_dim'], input_length=layer['input_length'])) print(f"model.add(Embedding(input_dim= {layer['input_dim']}, output_dim={ layer['output_dim'] }, input_length={ layer['input_length'] }))") elif layer['model'] == 'LSTM': #model.add(LSTM(100, return_sequences=True)) #model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, input_dim=1)) model.add(LSTM(units=layer['units'], use_bias=layer['use_bias'], dropout=layer['dropout'], recurrent_dropout=layer['recurrent_dropout'], return_sequences = layer['return_sequences'])) print(f"model.add(LSTM(units={layer['units']}, use_bias={layer['use_bias']}, dropout={layer['dropout']}, recurrent_dropout={layer['recurrent_dropout']} ))") elif layer['model'] == 'Dropout': #model.add(Dropout(0.2)) model.add(Dropout(layer['dropout_rate'])) print(f"model.add(Dropout({layer['dropout_rate']}))") elif layer['model'] == 'Dense': #{'model': 'Dense','units':64,'activation':relu'}, #model.add(Dense(100, activation='relu')) model.add(Dense(units=layer['units'], activation=layer['activation'])) print(f"model.add(Dense(units={layer['units']}, activation={layer['activation']}))") elif layer['model'] == 'Flatten': model.add(Flatten()) print(f'model.add(Flatten())') else: raise IOError ('invalid layer') #Create the model name import datetime now = datetime.datetime.now() modelName = now.strftime("%Y-%m-%d_%H-%M") try: print(model.summary()) except: pass return model, modelName # In[13]: def trainModelComplete(results_path): from keras.utils import to_categorical #--- PARAMETERS --- --- --- ---- --- --- ---- ---- --- ----- --- --- ---- #notes from website: #-- Common values are 50, 100, and 300. We will use 50 here, -- #-- but consider testing smaller or larger values. -- #-- We will use a two LSTM hidden layers with 100 memory cells each. -- #-- More memory cells and a deeper network may achieve better results. -- drseuss_text = 'data/combinedText.txt' seed_length = 50 length = seed_length + 1 epochs = 2 batch_size = 128 #-- ---- ---- --- ---- ----- ---- ----- ---- ----- ----- ---- ---- ---- ---- #-- load document --- --- --- --- -- drseuss_text = 'data/combinedText.txt' tokens = load_doc(drseuss_text) #-- Create sequencer and tokenizer -- --- --- --- --- --- --- --- sequences, tokenizer = sequencesCreate(length, tokens) vocab_size = len(tokenizer.word_index) + 1 #-- Creating X, y -- --- --- --- --- --- --- -- -- df = pd.DataFrame(sequences) print(f'sequences:\n{df.head(5)}') X, y = df.iloc[:,:-1], df.iloc[:,-1] seq_length = X.shape[1] input_shape = X.shape #-- One hot encoding -- --- --- --- --- --- - y = to_categorical(y, num_classes=vocab_size) print(f'seq_length: {seq_length}\nshape of X: {X.shape}\nshape of y: {y.shape}') #-- -- ---- --- --- --- --- --- ---- --- --- --- -- #-- Model List --- --- --- --- --- --- --- --- --- -- ---- --- --- --- ---- -- -- # modelList = [{'model':'Embedding', 'input_dim':vocab_size, 'output_dim': 256, 'input_length': seq_length}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model':'Flatten'}, # {'model': 'Dense','units':vocab_size,'activation':'softmax'}, # ] modelList = [{'model':'Embedding', 'input_dim':vocab_size, 'output_dim': 512, 'input_length': seq_length}, {'model': 'LSTM', 'units':512, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, {'model': 'Dense','units':100,'activation':'relu'}, # {'model': 'LSTM', 'units':512, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, # {'model': 'Dense','units':100,'activation':'relu'}, {'model':'Flatten'}, {'model': 'Dense','units':vocab_size,'activation':'softmax'}, ] #-- --- ---- --- ---- --- --- ---- --- ---- --- ---- --- ---- --- --- --- --- --- print(f'drseuss_text: \'{drseuss_text}\'\nseed_length: {seed_length}\nepochs: {epochs}\nbatch_size: {batch_size}' f'\nmodelList: {modelList}') #-- Create Model -- --- --- --- ---- --- -- ---- --- --- --- --- --- --- ---- --- --- model, modelName = defineModel(vocab_size, seq_length, modelList, length, input_shape) #-- save the tokenizer --- --- --- ---- --- --- ---- -- dump(tokenizer, open(results_path.rstrip('/').lstrip('/') + f'/token_'+modelName+'.pkl', 'wb')) #-- Save history and final model --- - writeFiles(modelName, modelList, seq_length, len(sequences), epochs, batch_size, results_path) #-- Fit model -- ---- --- --- --- ---- --- --- ---- --- --- --- --- --- --- --- --- history_callback = modelFit(model, modelName, X, y, seq_length, batch_size, epochs, results_path) loss_history = history_callback.history with open(results_path.rstrip('/').lstrip('/') + f'/loss_history_{modelName}.txt', 'w+') as f: f.write(str(loss_history)) # In[8]: # generate a sequence from a language model #def generate_seq(model, tokenizer, seq_length, seed_text, n_words): def generate_seq(seq_length, seed_text, n_words, filepath = '', modelName = '', tokenizerName = '', ): from keras.models import load_model from keras.preprocessing.sequence import pad_sequences import re if filepath : highest_epoch = 0 for filename in os.listdir(filepath): m = re.search('^wi_(\d+)_', filename) if m: if int(m.group(1)) > highest_epoch: highest_epoch = int(m.group(1)) modelName = filepath+'/'+filename if re.search('token', filename): tokenizerName = filepath+'/'+filename # load the model model = load_model(modelName) # load the tokenizer tokenizer = load(open(tokenizerName, 'rb')) #Make 50 words long seed_text = ' '.join(seed_text.split(' ')[0:seq_length]) result = list() in_text = seed_text # generate a fixed number of words for _ in range(n_words): # encode the text as integer encoded = tokenizer.texts_to_sequences([in_text])[0] # truncate sequences to a fixed length encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre') # predict probabilities for each word yhat = model.predict_classes(encoded, verbose=0) # map predicted word index to word out_word = '' for word, index in tokenizer.word_index.items(): if index == yhat: out_word = word break # append to input in_text += ' ' + out_word result.append(out_word) del model return ' '.join(result) # In[9]: # modelList = [{'model':'Embedding', 'input_dim':2830, 'output_dim': 256, 'input_length': 50}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model': 'LSTM', 'units':256, 'use_bias':True, 'dropout':.2, 'recurrent_dropout': 0, 'return_sequences': True}, # {'model': 'Dense','units':64,'activation':'relu'}, # {'model':'Flatten'}, # {'model': 'Dense','units':2830,'activation':'softmax'}, # ] # history_callback = {'history':{'loss': [6.8130, 6.3438, 6.0809, 5.6680, 5.0674, 4.1888, 3.2263, 2.4416, 1.8358, 1.3483, 0.9936, 0.7174, 0.5278, 0.3948, 0.2838, 0.2132, 0.1515, 0.1078, 0.0862, 0.0653, 0.0591, 0.0499, 0.0395, 0.0275, 0.0271, 0.0293, 0.0370, 0.0441, 0.0782, 0.1003, 0.0644, 0.0407, 0.0296, 0.0202, 0.0133, 0.0067, 0.0048, 0.0053, 0.0050, 0.0076, 0.0120, 0.0162, 0.0466, 0.1344, 0.1101, 0.0600, 0.0288, 0.0118, 0.0063], # 'acc': [0.0366, 0.0477, 0.0514, 0.0527, 0.0647, 0.1239, 0.1239, 0.4201, 0.5374, 0.6495, 0.7304, 0.7957, 0.8472, 0.8845, 0.9168, 0.9394, 0.9593, 0.9714, 0.9791, 0.9854, 0.9866, 0.9900, 0.9918, 0.9948, 0.9946, 0.9945, 0.9913, 0.9897, 0.9782, 0.9701, 0.9821, 0.9881, 0.9925, 0.9952, 0.9975, 0.9991, 0.9997, 0.9996, 0.9996, 0.9987, 0.9984, 0.9962, 0.9854, 0.9570, 0.9661, 0.9805, 0.9917, 0.9974, 0.9993]}} # writeFiles('NULL', '2018-10-22_11-31', history_callback, modelList, 50, total_sequences = 16175) # In[11]: if __name__ == '__main__': trainModelComplete('.') # In[ ]: #trainModelComplete() # In[ ]: def json_create(filepath = '.'): import os, ast, json, re, seed datetime = {} #-- Determine JSON file name -- jsonFile = f'{filepath}/Alldata.json'; i = '0' while os.path.isfile(jsonFile): i = str(int(i)+1) jsonFile = f"{filepath}/Alldata{i}.json" for filename in os.listdir(filepath): #wi_01_6.7077__2018-10-22_09-29.hdf5 m = re.search('wi_(..)_(......)__*(....-..-..)_(..-..).hdf5', filename) if m: epoch, loss, date, time = m.group(1), m.group(2), m.group(3), m.group(4) if date+'_'+time not in datetime.keys(): #print(f"{date+'_'+time} not in KEYS: \n{datetime.keys()}") tokenizer = filepath+f'/token_{date}_{time}.pkl' try: with open(filepath.rstrip('/').lstrip('/')+'/info_' + date+'_'+time + '.txt') as f: text = f.read() modelList = text.split(']')[0] + ']' modelHistory = '{' + ']'.join(text.split(']')[1:]).split('{')[1] #print(f"NEW DATA: {date+'_'+time}") modelHistory = ast.literal_eval(modelHistory) modelList = ast.literal_eval(modelList) epochs = modelHistory['epochs'] if os.path.isfile(f"{date+'_'+time}_loss_history.txt"): with open(f"{filepath.rstrip('/').lstrip('/')}/{date+'_'+time}_loss_history.txt") as f: model_history = f.read() model_history = ast.literal_eval(model_history) modelHistory['model_history'] = model_history except: modelList = [] modelHistory = {} datetime[date+'_'+time] = {'model_list': modelList, 'model_history': modelHistory, 'sequence_list': ['no_model_data']*(epochs+1)} try: seq_length = modelHistory['seq_length'] except: seq_length = 50 #print(f'{epoch}: {datetime}') #seq_length, seed_text, n_words, filepath = '', modelName = '', tokenizerName = '', ) datetime[date+'_'+time]['sequence_list'][int(epoch)] = generate_seq(seq_length, seed.seed_text, 50, filepath, filename, tokenizer) print('\n',filename, ": ",datetime[date+'_'+time]['sequence_list'][int(epoch)]) #-- Write JSON file -- --- ---- with open(jsonFile, 'w+') as fp: json.dump(datetime, fp) # In[ ]: #wi_76_0.0010__51_LSTM_256_True_Dense_256_relu_Dropout_0.2__LSTM_128_True_Dense_128_relu_Dropout_0.2__LSTM_64_False_Dense_64_relu_Flatten___Dense_2830_softmax.hdf def jsonify_the_old_style_file(filepath = '.'): import seed, re, os, json jsonFile = filepath + '/Alldata.json'; i = '0' #-- Determine JSON file name -- while os.path.isfile(jsonFile): i = str(int(i)+1) jsonFile = f"{filepath}/Alldata{i}.json" tokenizer = filepath + '/toke_51_LSTM_256_True_Dense_256_relu_Dropout_0.2__LSTM_128_True_Dense_128_relu_Dropout_0.2__LSTM_64_False_Dense_64_relu_Flatten___Dense_2830_softmax.pkl' jsondict = {'sequences': ['no_data']*112, 'model':None, 'loss': ['no_data']*112} for filename in os.listdir(filepath): m = re.search('wi_(..)_(......)__(.*).hdf5', filename) if m and re.search('51_LSTM_256_True_Dense_256_relu_Dropout_0.2__LSTM_128_True_Dense_128_relu_Dropout_0.2__LSTM_64_False_Dense_64_relu_Flatten___Dense_2830_softmax', filename): epoch, loss, modellist = m.group(1), m.group(2), m.group(3) jsondict['model'] = modellist jsondict['loss'][int(epoch)] = float(loss) #seq_length, seed_text, n_words, filepath = '', modelName = '', tokenizerName = '', ) jsondict['sequences'][int(epoch)] = generate_seq(50, seed.seed_text, 50, os.path.join(filepath,filename), tokenizer, 50, seed.seed_text, 50) print(epoch, ': ', jsondict['sequences'][int(epoch)]) #-- Write JSON file -- --- ---- with open(jsonFile, 'w+') as fp: json.dump(jsondict, fp)