diff --git a/ch16/ch16_part1.ipynb b/ch16/ch16_part1.ipynb index e435e766..fccd2741 100644 --- a/ch16/ch16_part1.ipynb +++ b/ch16/ch16_part1.ipynb @@ -4,7 +4,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Chapter 16\n", + "*Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com) & [Vahid Mirjalili](http://vahidmirjalili.com), Packt Publishing Ltd. 2019\n", + "\n", + "Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition\n", + "\n", + "Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 1/2)\n", "========\n" ] }, @@ -12,7 +23,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Introduction" + "# Introducing sequential data\n", + "\n", + "## Modeling sequential data⁠—order matters\n", + "\n", + "## Representing sequences\n", + "\n", + "## The different categories of sequence modeling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RNNs for modeling sequences\n", + "\n", + "## Understanding the RNN looping mechanism\n", + "\n", + "## Computing activations in an RNN\n", + "\n", + "## Hidden-recurrence vs. output-recurrence\n" ] }, { @@ -112,8 +142,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## The challenges of learning long-range interactions\n", "\n", - "# Project 1: Sentiment Analysis" + "## Long Short-Term Memory cells " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Implementing RNNs for sequence modeling in TensorFlow\n", + "\n", + "## Project one: predicting the sentiment of IMDb movie reviews\n", + "\n", + "### Preparing the movie review data\n", + "\n" ] }, { @@ -240,7 +283,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Train/validaiton/test splits" + " * **Train/validaiton/test splits**" ] }, { @@ -264,17 +307,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Tokenizer and Encoder\n", - "\n", - " * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n", - " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder" + " * **Tokenizer and Encoder**\n", + " * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n", + " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Encoding sequences: keeping the last 100 items in each sequence" + " * **Encoding sequences: keeping the last 100 items in each sequence**" ] }, { @@ -322,7 +364,7 @@ } ], "source": [ - "## Step 3: endoding each unique token into an integer\n", + "## Step 3: endoding each unique token into integers\n", "\n", "encoder = 
tfds.features.text.TokenTextEncoder(token_counts)\n", "\n", @@ -408,7 +450,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Batch" + " * **batch() vs. padded_batch()**" ] }, { @@ -500,9 +542,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Building an RNN layer:\n", - "\n", - "### Embedding layer\n", + "#### Embedding layers for sentence encoding\n", "\n", "\n", " * `input_dim`: number of words, i.e. maximum integer index + 1.\n", @@ -558,19 +598,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Different RNN Layers:\n", + "### Building an RNN model\n", "\n", - " * `tf.keras.layers.SimpleRNN(units, return_sequences=False)`\n", - " * `tf.keras.layers.LSTM(..)`\n", - " * `tf.keras.layers.GRU(..)`\n", - " * `tf.keras.layers.Bidirectional()`\n", + "* **Keras RNN layers:**\n", + " * `tf.keras.layers.SimpleRNN(units, return_sequences=False)`\n", + " * `tf.keras.layers.LSTM(..)`\n", + " * `tf.keras.layers.GRU(..)`\n", + " * `tf.keras.layers.Bidirectional()`\n", " \n", - "**Determine `return_sequenes=?`**\n", - " * In a multi-layer RNN, all RNN layers except the last one should have `return_sequenes=True`\n", - " * For the last RNN layer, decide based on the type of problem: \n", - " * many-to-many: -> `return_sequences=True`\n", - " * many-to-one : -> `return_sequenes=False`\n", - " * ..\n", + "* **Determine `return_sequenes=?`**\n", + " * In a multi-layer RNN, all RNN layers except the last one should have `return_sequenes=True`\n", + " * For the last RNN layer, decide based on the type of problem: \n", + " * many-to-many: -> `return_sequences=True`\n", + " * many-to-one : -> `return_sequenes=False`\n", + " * ..\n", " " ] }, @@ -710,7 +751,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Bidirectional LSTM using full-length sequences" + "### Building an RNN model for the sentiment analysis task" ] }, { @@ -817,7 +858,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SimpleRNN with short sequences" + " * **Trying SimpleRNN with short sequences**" ] }, { @@ -1206,7 +1247,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Appendix\n" + "# Appendix\n" ] }, { @@ -1364,6 +1405,33 @@ "tf.print(embed(np.array([1])))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Readers may ignore the next cell.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook ch16_part1.ipynb to script\n", + "[NbConvertApp] Writing 16880 bytes to ch16_part1.py\n" + ] + } + ], + "source": [ + "! 
python ../.convert_notebook_to_script.py --input ch16_part1.ipynb --output ch16_part1.py" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/ch16/ch16_part1.py b/ch16/ch16_part1.py new file mode 100644 index 00000000..53dc9056 --- /dev/null +++ b/ch16/ch16_part1.py @@ -0,0 +1,700 @@ +# coding: utf-8 + + +import tensorflow as tf +import tensorflow_datasets as tfds +import numpy as np +import pandas as pd +from collections import Counter +from tensorflow.keras.layers import Embedding +from tensorflow.keras import Sequential +from tensorflow.keras.layers import SimpleRNN +from tensorflow.keras.layers import Dense +from tensorflow.keras.layers import LSTM +from tensorflow.keras.layers import GRU +from tensorflow.keras.layers import Bidirectional + +# *Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com) & [Vahid Mirjalili](http://vahidmirjalili.com), Packt Publishing Ltd. 2019 +# +# Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition +# +# Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt) + +# Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 1/2) +# ======== +# + +# # Introducing sequential data +# +# ## Modeling sequential data⁠—order matters +# +# ## Representing sequences +# +# ## The different categories of sequence modeling + +# # RNNs for modeling sequences +# +# ## Understanding the RNN looping mechanism +# +# ## Computing activations in an RNN +# +# ## Hidden-recurrence vs. 
output-recurrence +# + + + +tf.random.set_seed(1) + +rnn_layer = tf.keras.layers.SimpleRNN( + units=2, use_bias=True, + return_sequences=True) +rnn_layer.build(input_shape=(None, None, 5)) + +w_xh, w_oo, b_h = rnn_layer.weights + +print('W_xh shape: ', w_xh.shape) +print('W_oo shape: ', w_oo.shape) +print('b_h shape: ', b_h.shape) + + + + +x_seq = tf.convert_to_tensor( + [[1.0]*5, [2.0]*5, [3.0]*5], + dtype=tf.float32) + + +## output of SimepleRNN: +output = rnn_layer(tf.reshape(x_seq, shape=(1, 3, 5))) + +## manually computing the output: +out_man = [] +for t in range(len(x_seq)): + xt = tf.reshape(x_seq[t], (1, 5)) + print('Time step {} =>'.format(t)) + print(' Input : ', xt.numpy()) + + ht = tf.matmul(xt, w_xh) + b_h + print(' Hidden : ', ht.numpy()) + + if t>0: + prev_o = out_man[t-1] + else: + prev_o = tf.zeros(shape=(ht.shape)) + + ot = ht + tf.matmul(prev_o, w_oo) + ot = tf.math.tanh(ot) + out_man.append(ot) + print(' Output (manual) : ', ot.numpy()) + print(' SimpleRNN output: '.format(t), output[0][t].numpy()) + print() + + +# ## The challenges of learning long-range interactions +# +# ## Long Short-Term Memory cells + +# # Implementing RNNs for sequence modeling in TensorFlow +# +# ## Project one: predicting the sentiment of IMDb movie reviews +# +# ### Preparing the movie review data +# +# + + + + + + + +df = pd.read_csv('movie_data.csv', encoding='utf-8') + +df.tail() + + + + +# Step 1: Create a dataset + +target = df.pop('sentiment') + +ds_raw = tf.data.Dataset.from_tensor_slices( + (df.values, target.values)) + +## inspection: +for ex in ds_raw.take(3): + tf.print(ex[0].numpy()[0][:50], ex[1]) + + +# * **Train/validaiton/test splits** + + + +tf.random.set_seed(1) + +ds_raw = ds_raw.shuffle( + 50000, reshuffle_each_iteration=False) + +ds_raw_test = ds_raw.take(25000) +ds_raw_train_valid = ds_raw.skip(25000) +ds_raw_train = ds_raw_train_valid.take(20000) +ds_raw_valid = ds_raw_train_valid.skip(20000) + + +# * **Tokenizer and Encoder** +# * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer +# * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder + +# * **Encoding sequences: keeping the last 100 items in each sequence** + + + +## Step 2: find unique words + + +tokenizer = tfds.features.text.Tokenizer() +token_counts = Counter() + +for example in ds_raw_train: + tokens = tokenizer.tokenize(example[0].numpy()[0]) + token_counts.update(tokens) + +print('Vocab-size: ', len(token_counts)) + + + + +## Step 3: endoding each unique token into integers + +encoder = tfds.features.text.TokenTextEncoder(token_counts) + +example_str = 'This is an example!' +encoder.encode(example_str) + + + + + +## Step 3-A: define the function for transformation +def encode(text_tensor, label): + text = text_tensor.numpy()[0] + encoded_text = encoder.encode(text) + return encoded_text, label + +## Step 3-B: wrap the encode function to a TF Op. +def encode_map_fn(text, label): + return tf.py_function(encode, inp=[text, label], + Tout=(tf.int64, tf.int64)) + + + + +ds_train = ds_raw_train.map(encode_map_fn) +ds_valid = ds_raw_valid.map(encode_map_fn) +ds_test = ds_raw_test.map(encode_map_fn) + +tf.random.set_seed(1) +for example in ds_train.shuffle(1000).take(5): + print('Sequence length: ', example[0].shape) + +example + + +# * **batch() vs. 
padded_batch()** + + + +# ###### ### ### # ### +# # # # # # # # # # +# # # # # # # # # # +# ##### ##### ##### # # ##### +# # # # # # # # # # +# # # # # # # # # # +# ###### # # # # # # +# +# this will result in error +BATCH_SIZE = 32 +train_data = all_encoded_data.batch(BATCH_SIZE) + +next(iter(train_data)) + +# Running this will result in error +# We cannot apply .batch() to this dataset + +# ###### ### ### # ### +# # # # # # # # # # +# # # # # # # # # # +# ##### ##### ##### # # ##### +# # # # # # # # # # +# # # # # # # # # # +# ###### # # # # # # + + + + +## Take a small subset +ds_subset = ds_train.take(8) +for example in ds_subset: + print('Individual Shape: ', example[0].shape) + +## batching the datasets +ds_batched = ds_subset.padded_batch( + 4, padded_shapes=([-1], [])) + +for batch in ds_batched: + print('Batch Shape: ', batch[0].shape) + + + + +## batching the datasets +train_data = ds_train.padded_batch( + 32, padded_shapes=([-1],[])) + +valid_data = ds_valid.padded_batch( + 32, padded_shapes=([-1],[])) + +test_data = ds_test.padded_batch( + 32, padded_shapes=([-1],[])) + + +# #### Embedding layers for sentence encoding +# +# +# * `input_dim`: number of words, i.e. maximum integer index + 1. +# * `output_dim`: +# * `input_length`: the length of (padded) sequence +# * for example, `'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9]` +# => input_lenght is 10 +# +# +# +# * When calling the layer, takes integr values as input, +# the embedding layer convert each interger into float vector of size `[output_dim]` +# * If input shape is `[BATCH_SIZE]`, output shape will be `[BATCH_SIZE, output_dim]` +# * If input shape is `[BATCH_SIZE, 10]`, output shape will be `[BATCH_SIZE, 10, output_dim]` + + + + +model = tf.keras.Sequential() + +model.add(Embedding(input_dim=100, + output_dim=6, + input_length=20, + name='embed-layer')) + +model.summary() + + +# ### Building an RNN model +# +# * **Keras RNN layers:** +# * `tf.keras.layers.SimpleRNN(units, return_sequences=False)` +# * `tf.keras.layers.LSTM(..)` +# * `tf.keras.layers.GRU(..)` +# * `tf.keras.layers.Bidirectional()` +# +# * **Determine `return_sequenes=?`** +# * In a multi-layer RNN, all RNN layers except the last one should have `return_sequenes=True` +# * For the last RNN layer, decide based on the type of problem: +# * many-to-many: -> `return_sequences=True` +# * many-to-one : -> `return_sequenes=False` +# * .. 
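+## A minimal illustrative sketch (tensor sizes chosen arbitrarily, not from the book):
+## it only shows how `return_sequences` changes the output shape of a Keras RNN layer,
+## which is what drives the many-to-one vs. many-to-many choice listed above.
+demo_inputs = tf.random.uniform((4, 10, 8))            # (batch, time steps, features)
+rnn_last_state = SimpleRNN(16)                         # return_sequences=False (default)
+rnn_all_states = SimpleRNN(16, return_sequences=True)
+print(rnn_last_state(demo_inputs).shape)               # (4, 16): final hidden state only -> many-to-one
+print(rnn_all_states(demo_inputs).shape)               # (4, 10, 16): one hidden state per step -> many-to-many / stacking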
+# + + + +## An example of building a RNN model +## with SimpleRNN layer + +model = Sequential() +model.add(Embedding(1000, 32)) +model.add(SimpleRNN(32, return_sequences=True)) +model.add(SimpleRNN(32)) +model.add(Dense(1)) +model.summary() + + + + +## An example of building a RNN model +## with LSTM layer + +model = Sequential() +model.add(Embedding(10000, 32)) +model.add(LSTM(32, return_sequences=True)) +model.add(LSTM(32)) +model.add(Dense(1)) +model.summary() + + + + +## An example of building a RNN model +## with GRU layer + +model = Sequential() +model.add(Embedding(10000, 32)) +model.add(GRU(32, return_sequences=True)) +model.add(GRU(32)) +model.add(Dense(1)) +model.summary() + + +# ### Building an RNN model for the sentiment analysis task + + + + +embedding_dim = 20 +vocab_size = len(token_counts) + 2 + +tf.random.set_seed(1) + +## build the model +bi_lstm_model = tf.keras.Sequential([ + tf.keras.layers.Embedding( + input_dim=vocab_size, + output_dim=embedding_dim, + name='embed-layer'), + + tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(64, name='lstm-layer'), + name='bidir-lstm'), + + tf.keras.layers.Dense(64, activation='relu'), + + tf.keras.layers.Dense(1, activation='sigmoid') +]) + +bi_lstm_model.summary() + +## compile and train: +bi_lstm_model.compile( + optimizer=tf.keras.optimizers.Adam(1e-3), + loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), + metrics=['accuracy']) + +history = bi_lstm_model.fit( + train_data, + validation_data=valid_data, + epochs=10) + +## evaluate on the test data +test_results= bi_lstm_model.evaluate(test_data) +print('Test Acc.: {:.2f}%'.format(test_results[1]*100)) + + + + +bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.h5') + + +# * **Trying SimpleRNN with short sequences** + + + + +def preprocess_datasets( + ds_raw_train, + ds_raw_valid, + ds_raw_test, + max_seq_length=None, + batch_size=32): + + ## Step 1: (already done => creating a dataset) + ## Step 2: find unique tokens + tokenizer = tfds.features.text.Tokenizer() + token_counts = Counter() + + for example in ds_raw_train: + tokens = tokenizer.tokenize(example[0].numpy()[0]) + if max_seq_length is not None: + tokens = tokens[-max_seq_length:] + token_counts.update(tokens) + + print('Vocab-size: ', len(token_counts)) + + + ## Step 3: encoding the texts + encoder = tfds.features.text.TokenTextEncoder(token_counts) + def encode(text_tensor, label): + text = text_tensor.numpy()[0] + encoded_text = encoder.encode(text) + if max_seq_length is not None: + encoded_text = encoded_text[-max_seq_length:] + return encoded_text, label + + def encode_map_fn(text, label): + return tf.py_function(encode, inp=[text, label], + Tout=(tf.int64, tf.int64)) + + ds_train = ds_raw_train.map(encode_map_fn) + ds_valid = ds_raw_valid.map(encode_map_fn) + ds_test = ds_raw_test.map(encode_map_fn) + + ## Step 4: batching the datasets + train_data = ds_train.padded_batch( + batch_size, padded_shapes=([-1],[])) + + valid_data = ds_valid.padded_batch( + batch_size, padded_shapes=([-1],[])) + + test_data = ds_test.padded_batch( + batch_size, padded_shapes=([-1],[])) + + return (train_data, valid_data, + test_data, len(token_counts)) + + + + + + +def build_rnn_model(embedding_dim, vocab_size, + recurrent_type='SimpleRNN', + n_recurrent_units=64, + n_recurrent_layers=1, + bidirectional=True): + + tf.random.set_seed(1) + + # build the model + model = tf.keras.Sequential() + + model.add( + Embedding( + input_dim=vocab_size, + output_dim=embedding_dim, + name='embed-layer') + ) + + for i in 
range(n_recurrent_layers):
+        return_sequences = (i < n_recurrent_layers-1)
+
+        if recurrent_type == 'SimpleRNN':
+            recurrent_layer = SimpleRNN(
+                units=n_recurrent_units,
+                return_sequences=return_sequences,
+                name='simprnn-layer-{}'.format(i))
+        elif recurrent_type == 'LSTM':
+            recurrent_layer = LSTM(
+                units=n_recurrent_units,
+                return_sequences=return_sequences,
+                name='lstm-layer-{}'.format(i))
+        elif recurrent_type == 'GRU':
+            recurrent_layer = GRU(
+                units=n_recurrent_units,
+                return_sequences=return_sequences,
+                name='gru-layer-{}'.format(i))
+
+        if bidirectional:
+            recurrent_layer = Bidirectional(
+                recurrent_layer, name='bidir-'+recurrent_layer.name)
+
+        model.add(recurrent_layer)
+
+    model.add(tf.keras.layers.Dense(64, activation='relu'))
+    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+
+    return model
+
+
+
+
+batch_size = 32
+embedding_dim = 20
+max_seq_length = 100
+
+train_data, valid_data, test_data, n = preprocess_datasets(
+    ds_raw_train, ds_raw_valid, ds_raw_test,
+    max_seq_length=max_seq_length,
+    batch_size=batch_size
+)
+
+
+vocab_size = n + 2
+
+rnn_model = build_rnn_model(
+    embedding_dim, vocab_size,
+    recurrent_type='SimpleRNN',
+    n_recurrent_units=64,
+    n_recurrent_layers=1,
+    bidirectional=True)
+
+rnn_model.summary()
+
+
+
+
+rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
+                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
+                  metrics=['accuracy'])
+
+
+history = rnn_model.fit(
+    train_data,
+    validation_data=valid_data,
+    epochs=10)
+
+
+
+
+results = rnn_model.evaluate(test_data)
+
+
+
+
+print('Test Acc.: {:.2f}%'.format(results[1]*100))
+
+
+# ## Optional exercise:
+#
+# ### Uni-directional SimpleRNN with full-length sequences
+
+
+
+
+batch_size = 32
+embedding_dim = 20
+max_seq_length = None
+
+train_data, valid_data, test_data, n = preprocess_datasets(
+    ds_raw_train, ds_raw_valid, ds_raw_test,
+    max_seq_length=max_seq_length,
+    batch_size=batch_size
+)
+
+
+vocab_size = n + 2
+
+rnn_model = build_rnn_model(
+    embedding_dim, vocab_size,
+    recurrent_type='SimpleRNN',
+    n_recurrent_units=64,
+    n_recurrent_layers=1,
+    bidirectional=False)
+
+rnn_model.summary()
+
+
+
+
+rnn_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
+                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
+                  metrics=['accuracy'])
+
+history = rnn_model.fit(
+    train_data,
+    validation_data=valid_data,
+    epochs=10)
+
+
+
+
+
+
+
+#
+
+# # Appendix
+#
+
+# ### A- An alternative way to get the dataset: using tensorflow_datasets
+
+
+
+
+imdb_bldr = tfds.builder('imdb_reviews')
+print(imdb_bldr.info)
+
+imdb_bldr.download_and_prepare()
+
+datasets = imdb_bldr.as_dataset(shuffle_files=False)
+
+datasets.keys()
+
+
+# ### B- Tokenizer and Encoder
+#
+#  * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer
+#  * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder
+#
+#
+
+
+
+vocab_set = {'a', 'b', 'c', 'd'}
+encoder = tfds.features.text.TokenTextEncoder(vocab_set)
+print(encoder)
+
+print(encoder.encode(b'a b c d, , : .'))
+
+print(encoder.encode(b'a b c d e f g h i z'))
+
+
+# ### C- Text Pre-processing with Keras
+
+
+
+TOP_K = 200
+MAX_LEN = 10
+
+tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)
+
+tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])
+sequences = 
tokenizer.texts_to_sequences(['this is an example', 'je suis en forme ']) +print(sequences) + +tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN) + + + + +#### +TOP_K = 20000 +MAX_LEN = 500 + +tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K) + +tokenizer.fit_on_texts( + [example['text'].numpy().decode('utf-8') + for example in imdb_train]) + +x_train = tokenizer.texts_to_sequences( + [example['text'].numpy().decode('utf-8') + for example in imdb_train]) + +print(len(x_train)) + + +x_train_padded = tf.keras.preprocessing.sequence.pad_sequences( + x_train, maxlen=MAX_LEN) + +print(x_train_padded.shape) + + +# ### D- Embedding +# +# + + + + +tf.random.set_seed(1) +embed = Embedding(input_dim=100, output_dim=4) + +inp_arr = np.array([1, 98, 5, 6, 67, 45]) +tf.print(embed(inp_arr)) +tf.print(embed(inp_arr).shape) + +tf.print(embed(np.array([1]))) + + + + + + diff --git a/ch16/ch16_part2.ipynb b/ch16/ch16_part2.ipynb index 1778e8f7..1b110d96 100644 --- a/ch16/ch16_part2.ipynb +++ b/ch16/ch16_part2.ipynb @@ -4,10 +4,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Chapter 16 \n", - "====\n", + "*Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com) & [Vahid Mirjalili](http://vahidmirjalili.com), Packt Publishing Ltd. 2019\n", "\n", - "## Project two: character-level language modeling in TensorFlow" + "Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition\n", + "\n", + "Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 2/2)\n", + "========\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Project two: character-level language modeling in TensorFlow\n", + "\n", + "### Preprocessing the dataset" ] }, { @@ -209,11 +228,11 @@ ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### Building a character-level RNN model" + ] }, { "cell_type": "code", @@ -328,7 +347,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Sampling" + "### Evaluation phase: generating new text passages" ] }, { @@ -449,7 +468,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Predictability vs. randomness" + "* **Predictability vs. 
randomness**" ] }, { @@ -539,6 +558,61 @@ " scale_factor=0.5))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Understanding language with the Transformer model\n", + "\n", + "## Understanding the self-attention mechanism\n", + "\n", + "## A basic version of self-attention\n", + "\n", + "### Parameterizing the self-attention mechanism with query, key, and value weights\n", + "\n", + "## Multi-head attention and the Transformer block" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "...\n", + "\n", + "\n", + "# Summary\n", + "\n", + "...\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Readers may ignore the next cell.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook ch16_part2.ipynb to script\n", + "[NbConvertApp] Writing 5753 bytes to ch16_part2.py\n" + ] + } + ], + "source": [ + "! python ../.convert_notebook_to_script.py --input ch16_part2.ipynb --output ch16_part2.py" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/ch16/ch16_part2.py b/ch16/ch16_part2.py new file mode 100644 index 00000000..03bcd774 --- /dev/null +++ b/ch16/ch16_part2.py @@ -0,0 +1,273 @@ +# coding: utf-8 + + +import numpy as np +import tensorflow as tf + +# *Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com) & [Vahid Mirjalili](http://vahidmirjalili.com), Packt Publishing Ltd. 2019 +# +# Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition +# +# Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt) + +# Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 2/2) +# ======== +# +# + +# ## Project two: character-level language modeling in TensorFlow +# +# ### Preprocessing the dataset + + + + + + + + +## Reading and processing text +with open('1268-0.txt', 'r') as fp: + text=fp.read() + +start_indx = text.find('THE MYSTERIOUS ISLAND') +end_indx = text.find('End of the Project Gutenberg') +print(start_indx, end_indx) + +text = text[start_indx:end_indx] +char_set = set(text) +print('Total Length: ', len(text)) +print('Unique Characters: ', len(char_set)) + + + + +chars_sorted = sorted(char_set) +char2int = {ch:i for i,ch in enumerate(chars_sorted)} +char_array = np.array(chars_sorted) + +text_encoded = np.array( + [char2int[ch] for ch in text], + dtype=np.int32) + +print('Text encoded shape: ', text_encoded.shape) + +print(text[:15], ' == Encoding ==> ', text_encoded[:15]) +print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]])) + + + + + +ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded) + +for ex in ds_text_encoded.take(5): + print('{} -> {}'.format(ex.numpy(), char_array[ex.numpy()])) + + + + +seq_length = 40 +chunk_size = seq_length + 1 + +ds_chunks = ds_text_encoded.batch(chunk_size, drop_remainder=True) + +## inspection: +for seq in ds_chunks.take(1): + input_seq = seq[:seq_length].numpy() + target = seq[seq_length].numpy() + print(input_seq, ' -> ', target) + print(repr(''.join(char_array[input_seq])), + ' -> ', 
repr(''.join(char_array[target]))) + + + + +## define the function for splitting x & y +def split_input_target(chunk): + input_seq = chunk[:-1] + target_seq = chunk[1:] + return input_seq, target_seq + +ds_sequences = ds_chunks.map(split_input_target) + +## inspection: +for example in ds_sequences.take(2): + print(' Input (x): ', repr(''.join(char_array[example[0].numpy()]))) + print('Target (y): ', repr(''.join(char_array[example[1].numpy()]))) + print() + + + + +# Batch size +BATCH_SIZE = 64 +BUFFER_SIZE = 10000 + +tf.random.set_seed(1) +ds = ds_sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)# drop_remainder=True) + +ds + + +# ### Building a character-level RNN model + + + +def build_model(vocab_size, embedding_dim, rnn_units): + model = tf.keras.Sequential([ + tf.keras.layers.Embedding(vocab_size, embedding_dim), + tf.keras.layers.LSTM( + rnn_units, return_sequences=True), + tf.keras.layers.Dense(vocab_size) + ]) + return model + + +charset_size = len(char_array) +embedding_dim = 256 +rnn_units = 512 + +tf.random.set_seed(1) + +model = build_model( + vocab_size = charset_size, + embedding_dim=embedding_dim, + rnn_units=rnn_units) + +model.summary() + + + + +model.compile( + optimizer='adam', + loss=tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True + )) + +model.fit(ds, epochs=20) + + +# ### Evaluation phase: generating new text passages + + + +tf.random.set_seed(1) + +logits = [[1.0, 1.0, 1.0]] +print('Probabilities: ', tf.math.softmax(logits).numpy()[0]) + +samples = tf.random.categorical( + logits=logits, num_samples=10) +tf.print(samples.numpy()) + + + + +tf.random.set_seed(1) + +logits = [[1.0, 1.0, 3.0]] +print('Probabilities: ', tf.math.softmax(logits).numpy()[0]) + +samples = tf.random.categorical( + logits=logits, num_samples=10) +tf.print(samples.numpy()) + + + + +def sample(model, starting_str, + len_generated_text=500, + max_input_length=40, + scale_factor=1.0): + encoded_input = [char2int[s] for s in starting_str] + encoded_input = tf.reshape(encoded_input, (1, -1)) + + generated_str = starting_str + + model.reset_states() + for i in range(len_generated_text): + logits = model(encoded_input) + logits = tf.squeeze(logits, 0) + + scaled_logits = logits * scale_factor + new_char_indx = tf.random.categorical( + scaled_logits, num_samples=1) + + new_char_indx = tf.squeeze(new_char_indx)[-1].numpy() + + generated_str += str(char_array[new_char_indx]) + + new_char_indx = tf.expand_dims([new_char_indx], 0) + encoded_input = tf.concat( + [encoded_input, new_char_indx], + axis=1) + encoded_input = encoded_input[:, -max_input_length:] + + return generated_str + +tf.random.set_seed(1) +print(sample(model, starting_str="The island", + scale_factor=1.0)) + + +# * **Predictability vs. 
randomness** + + + +logits = np.array([[1.0, 1.0, 3.0]]) + +print('Probabilities before scaling: ', tf.math.softmax(logits).numpy()[0]) + +print('Probabilities after scaling with 0.5:', tf.math.softmax(0.5*logits).numpy()[0]) + +print('Probabilities after scaling with 0.1:', tf.math.softmax(0.1*logits).numpy()[0]) + + + + +tf.random.set_seed(1) +print(sample(model, starting_str="The island", + scale_factor=2.0)) + + + + +tf.random.set_seed(1) +print(sample(model, starting_str="The island", + scale_factor=0.5)) + + +# # Understanding language with the Transformer model +# +# ## Understanding the self-attention mechanism +# +# ## A basic version of self-attention +# +# ### Parameterizing the self-attention mechanism with query, key, and value weights +# +# ## Multi-head attention and the Transformer block + +# +# ... +# +# +# # Summary +# +# ... +# + +# +# +# Readers may ignore the next cell. +# + + + + + + + + +
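+## A minimal sketch of the basic (unparameterized) self-attention step from the
+## Transformer section above; the toy tensor sizes are arbitrary and this is only
+## an illustration, not the book's code.
+## For each position i: omega_ij = x_i . x_j, alpha_i = softmax(omega_i), z_i = sum_j alpha_ij * x_j.
+x_tokens = tf.random.uniform((5, 4))                     # 5 token embeddings of size 4
+omega = tf.matmul(x_tokens, x_tokens, transpose_b=True)  # pairwise dot-product similarities, shape (5, 5)
+alpha = tf.nn.softmax(omega, axis=-1)                    # attention weights; each row sums to 1
+z = tf.matmul(alpha, x_tokens)                           # context vectors, shape (5, 4)
+print(alpha.shape, z.shape)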