diff --git a/.gitignore b/.gitignore
index 51cf15f2..aae28529 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# Ch16 data files
+ch16/models/
+ch16/movie_data.csv
+
# Ch15 data files
ch15/models/
diff --git a/ch16/ch16_part1.ipynb b/ch16/ch16_part1.ipynb
index fccd2741..8cc8c163 100644
--- a/ch16/ch16_part1.ipynb
+++ b/ch16/ch16_part1.ipynb
@@ -15,8 +15,39 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 1/2)\n",
- "========\n"
+ "# Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (Part 1/2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sebastian Raschka & Vahid Mirjalili \n",
+ "last updated: 2019-11-03 \n",
+ "\n",
+ "numpy 1.17.2\n",
+ "scipy 1.2.1\n",
+ "matplotlib 3.1.0\n",
+ "tensorflow 2.0.0\n",
+ "tensorflow_datasets 1.3.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext watermark\n",
+ "%watermark -a \"Sebastian Raschka & Vahid Mirjalili\" -u -d -p numpy,scipy,matplotlib,tensorflow,tensorflow_datasets"
]
},
{
@@ -47,16 +78,16 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "W_xh shape: (5, 2)\n",
- "W_oo shape: (2, 2)\n",
- "b_h shape: (2,)\n"
+ "W_xh shape: (5, 2)\n",
+ "W_oo shape: (2, 2)\n",
+ "b_h shape: (2,)\n"
]
}
],
@@ -71,14 +102,14 @@
"\n",
"w_xh, w_oo, b_h = rnn_layer.weights\n",
"\n",
- "print('W_xh shape: ', w_xh.shape)\n",
- "print('W_oo shape: ', w_oo.shape)\n",
- "print('b_h shape: ', b_h.shape)"
+ "print('W_xh shape:', w_xh.shape)\n",
+ "print('W_oo shape:', w_oo.shape)\n",
+ "print('b_h shape:', b_h.shape)"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -86,22 +117,22 @@
"output_type": "stream",
"text": [
"Time step 0 =>\n",
- " Input : [[1. 1. 1. 1. 1.]]\n",
- " Hidden : [[0.41464037 0.96012145]]\n",
- " Output (manual) : [[0.39240566 0.74433106]]\n",
- " SimpleRNN output: [0.39240566 0.74433106]\n",
+ " Input : [[1. 1. 1. 1. 1.]]\n",
+ " Hidden : [[0.41464037 0.96012145]]\n",
+ " Output (manual) : [[0.39240566 0.74433106]]\n",
+ " SimpleRNN output: [0.39240566 0.74433106]\n",
"\n",
"Time step 1 =>\n",
- " Input : [[2. 2. 2. 2. 2.]]\n",
- " Hidden : [[0.82928073 1.9202429 ]]\n",
- " Output (manual) : [[0.80116504 0.9912947 ]]\n",
- " SimpleRNN output: [0.80116504 0.9912947 ]\n",
+ " Input : [[2. 2. 2. 2. 2.]]\n",
+ " Hidden : [[0.82928073 1.9202429 ]]\n",
+ " Output (manual) : [[0.80116504 0.9912947 ]]\n",
+ " SimpleRNN output: [0.80116504 0.9912947 ]\n",
"\n",
"Time step 2 =>\n",
- " Input : [[3. 3. 3. 3. 3.]]\n",
- " Hidden : [[1.243921 2.8803642]]\n",
- " Output (manual) : [[0.95468265 0.9993069 ]]\n",
- " SimpleRNN output: [0.95468265 0.9993069 ]\n",
+ " Input : [[3. 3. 3. 3. 3.]]\n",
+ " Hidden : [[1.243921 2.8803642]]\n",
+ " Output (manual) : [[0.95468265 0.9993069 ]]\n",
+ " SimpleRNN output: [0.95468265 0.9993069 ]\n",
"\n"
]
}
@@ -120,10 +151,10 @@
"for t in range(len(x_seq)):\n",
" xt = tf.reshape(x_seq[t], (1, 5))\n",
" print('Time step {} =>'.format(t))\n",
- " print(' Input : ', xt.numpy())\n",
+ " print(' Input :', xt.numpy())\n",
" \n",
" ht = tf.matmul(xt, w_xh) + b_h \n",
- " print(' Hidden : ', ht.numpy())\n",
+ " print(' Hidden :', ht.numpy())\n",
" \n",
" if t>0:\n",
" prev_o = out_man[t-1]\n",
@@ -133,8 +164,8 @@
" ot = ht + tf.matmul(prev_o, w_oo)\n",
" ot = tf.math.tanh(ot)\n",
" out_man.append(ot)\n",
- " print(' Output (manual) : ', ot.numpy())\n",
- " print(' SimpleRNN output: '.format(t), output[0][t].numpy())\n",
+ " print(' Output (manual) :', ot.numpy())\n",
+ " print(' SimpleRNN output:'.format(t), output[0][t].numpy())\n",
" print()"
]
},
@@ -161,7 +192,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -173,7 +204,22 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import gzip\n",
+ "import shutil\n",
+ "\n",
+ "\n",
+ "with gzip.open('../ch08/movie_data.csv.gz', 'rb') as f_in, open('movie_data.csv', 'wb') as f_out:\n",
+ " shutil.copyfileobj(f_in, f_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -203,27 +249,27 @@
" \n",
"
\n",
" \n",
- " 49995 | \n",
+ " 49995 | \n",
" OK, lets start with the best. the building. al... | \n",
" 0 | \n",
"
\n",
" \n",
- " 49996 | \n",
+ " 49996 | \n",
" The British 'heritage film' industry is out of... | \n",
" 0 | \n",
"
\n",
" \n",
- " 49997 | \n",
+ " 49997 | \n",
" I don't even know where to begin on this one. ... | \n",
" 0 | \n",
"
\n",
" \n",
- " 49998 | \n",
+ " 49998 | \n",
" Richard Tyler is a little boy who is scared of... | \n",
" 0 | \n",
"
\n",
" \n",
- " 49999 | \n",
+ " 49999 | \n",
" I waited long to watch this movie. Also becaus... | \n",
" 1 | \n",
"
\n",
@@ -240,7 +286,7 @@
"49999 I waited long to watch this movie. Also becaus... 1"
]
},
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -248,12 +294,12 @@
"source": [
"df = pd.read_csv('movie_data.csv', encoding='utf-8')\n",
"\n",
- "df.tail()\n"
+ "df.tail()"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -288,7 +334,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -321,14 +367,14 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Vocab-size: 87007\n"
+ "Vocab-size: 87007\n"
]
}
],
@@ -344,12 +390,12 @@
" tokens = tokenizer.tokenize(example[0].numpy()[0])\n",
" token_counts.update(tokens)\n",
" \n",
- "print('Vocab-size: ', len(token_counts))"
+ "print('Vocab-size:', len(token_counts))"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -358,7 +404,7 @@
"[232, 9, 270, 1123]"
]
},
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -374,12 +420,12 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "\n",
"## Step 3-A: define the function for transformation\n",
+ "\n",
"def encode(text_tensor, label):\n",
" text = text_tensor.numpy()[0]\n",
" encoded_text = encoder.encode(text)\n",
@@ -388,29 +434,29 @@
"## Step 3-B: wrap the encode function to a TF Op.\n",
"def encode_map_fn(text, label):\n",
" return tf.py_function(encode, inp=[text, label], \n",
- " Tout=(tf.int64, tf.int64))\n"
+ " Tout=(tf.int64, tf.int64))"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Sequence length: (24,)\n",
- "Sequence length: (179,)\n",
- "Sequence length: (262,)\n",
- "Sequence length: (535,)\n",
- "Sequence length: (130,)\n"
+ "Sequence length: (24,)\n",
+ "Sequence length: (179,)\n",
+ "Sequence length: (262,)\n",
+ "Sequence length: (535,)\n",
+ "Sequence length: (130,)\n"
]
},
{
"data": {
"text/plain": [
- "(,\n",
- " )"
+ " )"
]
},
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -441,7 +487,7 @@
"\n",
"tf.random.set_seed(1)\n",
"for example in ds_train.shuffle(1000).take(5):\n",
- " print('Sequence length: ', example[0].shape)\n",
+ " print('Sequence length:', example[0].shape)\n",
" \n",
"example"
]
@@ -454,20 +500,14 @@
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "# ###### ### ### # ### \n",
- "# # # # # # # # # # \n",
- "# # # # # # # # # # \n",
- "# ##### ##### ##### # # ##### \n",
- "# # # # # # # # # # \n",
- "# # # # # # # # # # \n",
- "# ###### # # # # # # \n",
- "#\n",
+ "```python\n",
+ "\n",
"# this will result in error\n",
+ "\n",
+ "\n",
"BATCH_SIZE = 32\n",
"train_data = all_encoded_data.batch(BATCH_SIZE)\n",
"\n",
@@ -475,55 +515,49 @@
"\n",
"# Running this will result in error\n",
"# We cannot apply .batch() to this dataset\n",
- "\n",
- "# ###### ### ### # ### \n",
- "# # # # # # # # # # \n",
- "# # # # # # # # # # \n",
- "# ##### ##### ##### # # ##### \n",
- "# # # # # # # # # # \n",
- "# # # # # # # # # # \n",
- "# ###### # # # # # # "
+ "```"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Individual Shape: (119,)\n",
- "Individual Shape: (688,)\n",
- "Individual Shape: (308,)\n",
- "Individual Shape: (204,)\n",
- "Individual Shape: (326,)\n",
- "Individual Shape: (240,)\n",
- "Individual Shape: (127,)\n",
- "Individual Shape: (453,)\n",
- "Batch Shape: (4, 688) #columns: 688\n",
- "Batch Shape: (4, 453) #columns: 453\n"
+ "Individual Shape: (119,)\n",
+ "Individual Shape: (688,)\n",
+ "Individual Shape: (308,)\n",
+ "Individual Shape: (204,)\n",
+ "Individual Shape: (326,)\n",
+ "Individual Shape: (240,)\n",
+ "Individual Shape: (127,)\n",
+ "Individual Shape: (453,)\n",
+ "Batch Shape: (4, 688)\n",
+ "Batch Shape: (4, 453)\n"
]
}
],
"source": [
"## Take a small subset\n",
+ "\n",
"ds_subset = ds_train.take(8)\n",
"for example in ds_subset:\n",
- " print('Individual Shape: ', example[0].shape)\n",
+ " print('Individual Shape:', example[0].shape)\n",
"\n",
"## batching the datasets\n",
"ds_batched = ds_subset.padded_batch(\n",
" 4, padded_shapes=([-1], []))\n",
"\n",
"for batch in ds_batched:\n",
- " print('Batch Shape: ', batch[0].shape)"
+ " print('Batch Shape:', batch[0].shape)"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -535,7 +569,7 @@
" 32, padded_shapes=([-1],[]))\n",
"\n",
"test_data = ds_test.padded_batch(\n",
- " 32, padded_shapes=([-1],[]))\n"
+ " 32, padded_shapes=([-1],[]))"
]
},
{
@@ -561,7 +595,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -584,6 +618,7 @@
"source": [
"from tensorflow.keras.layers import Embedding\n",
"\n",
+ "\n",
"model = tf.keras.Sequential()\n",
"\n",
"model.add(Embedding(input_dim=100,\n",
@@ -617,7 +652,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -628,11 +663,11 @@
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
- "embedding_1 (Embedding) (None, None, 32) 32000 \n",
+ "embedding (Embedding) (None, None, 32) 32000 \n",
"_________________________________________________________________\n",
- "simple_rnn (SimpleRNN) (None, None, 32) 2080 \n",
+ "simple_rnn_1 (SimpleRNN) (None, None, 32) 2080 \n",
"_________________________________________________________________\n",
- "simple_rnn_1 (SimpleRNN) (None, 32) 2080 \n",
+ "simple_rnn_2 (SimpleRNN) (None, 32) 2080 \n",
"_________________________________________________________________\n",
"dense (Dense) (None, 1) 33 \n",
"=================================================================\n",
@@ -646,6 +681,7 @@
"source": [
"## An example of building a RNN model\n",
"## with SimpleRNN layer\n",
+ "\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Embedding\n",
"from tensorflow.keras.layers import SimpleRNN\n",
@@ -656,12 +692,12 @@
"model.add(SimpleRNN(32, return_sequences=True))\n",
"model.add(SimpleRNN(32))\n",
"model.add(Dense(1))\n",
- "model.summary()\n"
+ "model.summary()"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -672,7 +708,7 @@
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
- "embedding_2 (Embedding) (None, None, 32) 320000 \n",
+ "embedding_1 (Embedding) (None, None, 32) 320000 \n",
"_________________________________________________________________\n",
"lstm (LSTM) (None, None, 32) 8320 \n",
"_________________________________________________________________\n",
@@ -690,22 +726,22 @@
"source": [
"## An example of building a RNN model\n",
"## with LSTM layer\n",
- "from tensorflow.keras import Sequential\n",
- "from tensorflow.keras.layers import Embedding\n",
+ "\n",
+ "\n",
"from tensorflow.keras.layers import LSTM\n",
- "from tensorflow.keras.layers import Dense\n",
+ "\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(10000, 32))\n",
"model.add(LSTM(32, return_sequences=True))\n",
"model.add(LSTM(32))\n",
"model.add(Dense(1))\n",
- "model.summary()\n"
+ "model.summary()"
]
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -716,7 +752,7 @@
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
- "embedding_3 (Embedding) (None, None, 32) 320000 \n",
+ "embedding_2 (Embedding) (None, None, 32) 320000 \n",
"_________________________________________________________________\n",
"gru (GRU) (None, None, 32) 6336 \n",
"_________________________________________________________________\n",
@@ -734,17 +770,14 @@
"source": [
"## An example of building a RNN model\n",
"## with GRU layer\n",
- "from tensorflow.keras import Sequential\n",
- "from tensorflow.keras.layers import Embedding\n",
"from tensorflow.keras.layers import GRU\n",
- "from tensorflow.keras.layers import Dense\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(10000, 32))\n",
"model.add(GRU(32, return_sequences=True))\n",
"model.add(GRU(32))\n",
"model.add(Dense(1))\n",
- "model.summary()\n"
+ "model.summary()"
]
},
{
@@ -756,7 +789,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -780,32 +813,31 @@
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"Epoch 1/10\n",
- "625/625 [==============================] - 380s 608ms/step - loss: 0.5173 - accuracy: 0.7376 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
+ "625/625 [==============================] - 268s 429ms/step - loss: 0.5440 - accuracy: 0.7143 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
"Epoch 2/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.2747 - accuracy: 0.8955 - val_loss: 0.3982 - val_accuracy: 0.8516\n",
+ "625/625 [==============================] - 262s 419ms/step - loss: 0.2753 - accuracy: 0.8903 - val_loss: 0.3489 - val_accuracy: 0.8634\n",
"Epoch 3/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.1528 - accuracy: 0.9478 - val_loss: 0.4270 - val_accuracy: 0.8588\n",
+ "625/625 [==============================] - 262s 420ms/step - loss: 0.1378 - accuracy: 0.9514 - val_loss: 0.4153 - val_accuracy: 0.8434\n",
"Epoch 4/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0958 - accuracy: 0.9674 - val_loss: 0.6082 - val_accuracy: 0.8358\n",
+ "625/625 [==============================] - 263s 420ms/step - loss: 0.0685 - accuracy: 0.9778 - val_loss: 0.4895 - val_accuracy: 0.8450\n",
"Epoch 5/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0551 - accuracy: 0.9836 - val_loss: 0.6766 - val_accuracy: 0.8468\n",
+ "625/625 [==============================] - 263s 420ms/step - loss: 0.0470 - accuracy: 0.9861 - val_loss: 0.5602 - val_accuracy: 0.8228\n",
"Epoch 6/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0454 - accuracy: 0.9858 - val_loss: 0.6429 - val_accuracy: 0.8398\n",
+ "625/625 [==============================] - 263s 420ms/step - loss: 0.0383 - accuracy: 0.9884 - val_loss: 0.6552 - val_accuracy: 0.8200\n",
"Epoch 7/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0418 - accuracy: 0.9870 - val_loss: 0.6093 - val_accuracy: 0.8388\n",
+ "625/625 [==============================] - 262s 420ms/step - loss: 0.0294 - accuracy: 0.9916 - val_loss: 0.6579 - val_accuracy: 0.8380\n",
"Epoch 8/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0288 - accuracy: 0.9908 - val_loss: 0.7021 - val_accuracy: 0.8356\n",
+ "625/625 [==============================] - 262s 420ms/step - loss: 0.0321 - accuracy: 0.9894 - val_loss: 0.8191 - val_accuracy: 0.8182\n",
"Epoch 9/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0308 - accuracy: 0.9909 - val_loss: 0.7601 - val_accuracy: 0.8392\n",
+ "625/625 [==============================] - 263s 420ms/step - loss: 0.0552 - accuracy: 0.9809 - val_loss: 0.7586 - val_accuracy: 0.8496\n",
"Epoch 10/10\n",
- "625/625 [==============================] - 365s 584ms/step - loss: 0.0205 - accuracy: 0.9931 - val_loss: 0.8158 - val_accuracy: 0.8388\n",
- "782/782 [==============================] - 140s 179ms/step - loss: 0.8131 - accuracy: 0.8392\n",
- "Test Acc.: 83.92%\n"
+ "625/625 [==============================] - 262s 420ms/step - loss: 0.0252 - accuracy: 0.9921 - val_loss: 0.7784 - val_accuracy: 0.8474\n",
+ "782/782 [==============================] - 100s 128ms/step - loss: 0.7809 - accuracy: 0.8437\n",
+ "Test Acc.: 84.37%\n"
]
}
],
"source": [
- "\n",
"embedding_dim = 20\n",
"vocab_size = len(token_counts) + 2\n",
"\n",
@@ -847,10 +879,14 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
+ "if not os.path.exists('models'):\n",
+ " os.mkdir('models')\n",
+ "\n",
+ "\n",
"bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.h5')"
]
},
@@ -863,12 +899,10 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
- "from collections import Counter\n",
- "\n",
"def preprocess_datasets(\n",
" ds_raw_train, \n",
" ds_raw_valid, \n",
@@ -887,7 +921,7 @@
" tokens = tokens[-max_seq_length:]\n",
" token_counts.update(tokens)\n",
"\n",
- " print('Vocab-size: ', len(token_counts))\n",
+ " print('Vocab-size:', len(token_counts))\n",
"\n",
"\n",
" ## Step 3: encoding the texts\n",
@@ -923,17 +957,10 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
- "from tensorflow.keras.layers import Embedding\n",
- "from tensorflow.keras.layers import Bidirectional\n",
- "from tensorflow.keras.layers import SimpleRNN\n",
- "from tensorflow.keras.layers import LSTM\n",
- "from tensorflow.keras.layers import GRU\n",
- "\n",
- "\n",
"def build_rnn_model(embedding_dim, vocab_size,\n",
" recurrent_type='SimpleRNN',\n",
" n_recurrent_units=64,\n",
@@ -985,15 +1012,15 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Vocab-size: 58063\n",
- "Model: \"sequential_5\"\n",
+ "Vocab-size: 58063\n",
+ "Model: \"sequential_6\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
@@ -1013,6 +1040,8 @@
}
],
"source": [
+ "from tensorflow.keras.layers import Bidirectional\n",
+ "\n",
"\n",
"batch_size = 32\n",
"embedding_dim = 20\n",
@@ -1039,7 +1068,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
@@ -1047,25 +1076,25 @@
"output_type": "stream",
"text": [
"Epoch 1/10\n",
- "625/625 [==============================] - 71s 113ms/step - loss: 0.7055 - accuracy: 0.5026 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
+ "625/625 [==============================] - 54s 87ms/step - loss: 0.6997 - accuracy: 0.5013 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
"Epoch 2/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.7037 - accuracy: 0.5113 - val_loss: 0.6953 - val_accuracy: 0.5072\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.6741 - accuracy: 0.5641 - val_loss: 0.7000 - val_accuracy: 0.6032\n",
"Epoch 3/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.6954 - accuracy: 0.5213 - val_loss: 0.6884 - val_accuracy: 0.5452\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.4976 - accuracy: 0.7637 - val_loss: 0.5935 - val_accuracy: 0.6918\n",
"Epoch 4/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.6317 - accuracy: 0.6259 - val_loss: 0.6152 - val_accuracy: 0.6522\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.3362 - accuracy: 0.8600 - val_loss: 0.4991 - val_accuracy: 0.7818\n",
"Epoch 5/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.4913 - accuracy: 0.7629 - val_loss: 0.7798 - val_accuracy: 0.6278\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.2168 - accuracy: 0.9157 - val_loss: 0.4592 - val_accuracy: 0.8118\n",
"Epoch 6/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.4252 - accuracy: 0.8094 - val_loss: 0.6105 - val_accuracy: 0.7432\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.1549 - accuracy: 0.9422 - val_loss: 0.6382 - val_accuracy: 0.7686\n",
"Epoch 7/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.4100 - accuracy: 0.8122 - val_loss: 0.6718 - val_accuracy: 0.7016\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.0899 - accuracy: 0.9696 - val_loss: 0.6741 - val_accuracy: 0.7848\n",
"Epoch 8/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.3749 - accuracy: 0.8281 - val_loss: 0.7762 - val_accuracy: 0.6348\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.0659 - accuracy: 0.9777 - val_loss: 0.6549 - val_accuracy: 0.8022\n",
"Epoch 9/10\n",
- "625/625 [==============================] - 69s 110ms/step - loss: 0.2711 - accuracy: 0.8813 - val_loss: 0.7503 - val_accuracy: 0.6892\n",
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.0710 - accuracy: 0.9758 - val_loss: 0.6970 - val_accuracy: 0.7830\n",
"Epoch 10/10\n",
- "625/625 [==============================] - 69s 111ms/step - loss: 0.1665 - accuracy: 0.9367 - val_loss: 0.7172 - val_accuracy: 0.7454\n"
+ "625/625 [==============================] - 52s 83ms/step - loss: 0.0476 - accuracy: 0.9839 - val_loss: 0.7840 - val_accuracy: 0.8038\n"
]
}
],
@@ -1083,41 +1112,31 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "782/782 [==============================] - 44s 57ms/step - loss: 1.2369 - accuracy: 0.7341\n"
+ "782/782 [==============================] - 28s 36ms/step - loss: 0.7896 - accuracy: 0.8070\n"
]
- },
- {
- "data": {
- "text/plain": [
- "[1.236854262745289, 0.73412]"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
- "rnn_model.evaluate(test_data)"
+ "results = rnn_model.evaluate(test_data)"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Test Acc.: 73.41%\n"
+ "Test Acc.: 80.70%\n"
]
}
],
@@ -1136,15 +1155,15 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Vocab-size: 87007\n",
- "Model: \"sequential_6\"\n",
+ "Vocab-size: 87007\n",
+ "Model: \"sequential_7\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
@@ -1164,7 +1183,6 @@
}
],
"source": [
- "\n",
"batch_size = 32\n",
"embedding_dim = 20\n",
"max_seq_length = None\n",
@@ -1190,7 +1208,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
@@ -1198,25 +1216,25 @@
"output_type": "stream",
"text": [
"Epoch 1/10\n",
- "625/625 [==============================] - 226s 361ms/step - loss: 0.6993 - accuracy: 0.5034 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
+ "625/625 [==============================] - 178s 285ms/step - loss: 0.6991 - accuracy: 0.5019 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
"Epoch 2/10\n",
- "625/625 [==============================] - 224s 359ms/step - loss: 0.6993 - accuracy: 0.5006 - val_loss: 0.6998 - val_accuracy: 0.5010\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6983 - accuracy: 0.4920 - val_loss: 0.6931 - val_accuracy: 0.5108\n",
"Epoch 3/10\n",
- "625/625 [==============================] - 281s 449ms/step - loss: 0.6968 - accuracy: 0.5034 - val_loss: 0.6960 - val_accuracy: 0.5004\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6949 - accuracy: 0.4975 - val_loss: 0.6930 - val_accuracy: 0.5128\n",
"Epoch 4/10\n",
- "625/625 [==============================] - 288s 460ms/step - loss: 0.6958 - accuracy: 0.5038 - val_loss: 0.6953 - val_accuracy: 0.5002\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6945 - accuracy: 0.5002 - val_loss: 0.6929 - val_accuracy: 0.5126\n",
"Epoch 5/10\n",
- "625/625 [==============================] - 287s 459ms/step - loss: 0.6952 - accuracy: 0.5045 - val_loss: 0.6947 - val_accuracy: 0.4952\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6944 - accuracy: 0.5058 - val_loss: 0.6936 - val_accuracy: 0.5056\n",
"Epoch 6/10\n",
- "625/625 [==============================] - 287s 459ms/step - loss: 0.6945 - accuracy: 0.5080 - val_loss: 0.6945 - val_accuracy: 0.4956\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6959 - accuracy: 0.5045 - val_loss: 0.6932 - val_accuracy: 0.5118\n",
"Epoch 7/10\n",
- "625/625 [==============================] - 287s 459ms/step - loss: 0.6940 - accuracy: 0.5109 - val_loss: 0.6945 - val_accuracy: 0.4960\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6943 - accuracy: 0.5029 - val_loss: 0.6932 - val_accuracy: 0.5124\n",
"Epoch 8/10\n",
- "625/625 [==============================] - 288s 461ms/step - loss: 0.6936 - accuracy: 0.5110 - val_loss: 0.6963 - val_accuracy: 0.4968\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6937 - accuracy: 0.5059 - val_loss: 0.6932 - val_accuracy: 0.5128\n",
"Epoch 9/10\n",
- "625/625 [==============================] - 241s 386ms/step - loss: 0.6934 - accuracy: 0.5101 - val_loss: 0.6948 - val_accuracy: 0.4960\n",
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6934 - accuracy: 0.5053 - val_loss: 0.6930 - val_accuracy: 0.5124\n",
"Epoch 10/10\n",
- "625/625 [==============================] - 224s 358ms/step - loss: 0.6935 - accuracy: 0.5096 - val_loss: 0.6936 - val_accuracy: 0.4986\n"
+ "625/625 [==============================] - 177s 283ms/step - loss: 0.6931 - accuracy: 0.5088 - val_loss: 0.6930 - val_accuracy: 0.5130\n"
]
}
],
@@ -1231,18 +1249,6 @@
" epochs=10)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": []
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -1254,137 +1260,1526 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### A- An alterntaive way to get the dataset: using tensorflow_datasets"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "imdb_bldr = tfds.builder('imdb_reviews')\n",
- "print(imdb_bldr.info)\n",
- "\n",
- "imdb_bldr.download_and_prepare()\n",
- "\n",
- "datasets = imdb_bldr.as_dataset(shuffle_files=False)\n",
- "\n",
- "datasets.keys()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### B- Tokenizer and Encoder\n",
- "\n",
- " * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n",
- " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder\n",
- "\n"
+ "### A -- An alternative way to get the dataset: using tensorflow_datasets"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "[1, 3, 2, 4]\n",
- "[1, 3, 2, 4, 5, 5, 5, 5, 5, 5]\n"
+ "tfds.core.DatasetInfo(\n",
+ " name='imdb_reviews',\n",
+ " version=0.1.0,\n",
+ " description='Large Movie Review Dataset.\n",
+ "This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',\n",
+ " urls=['http://ai.stanford.edu/~amaas/data/sentiment/'],\n",
+ " features=FeaturesDict({\n",
+ " 'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),\n",
+ " 'text': Text(shape=(), dtype=tf.string),\n",
+ " }),\n",
+ " total_num_examples=100000,\n",
+ " splits={\n",
+ " 'test': 25000,\n",
+ " 'train': 25000,\n",
+ " 'unsupervised': 50000,\n",
+ " },\n",
+ " supervised_keys=('text', 'label'),\n",
+ " citation=\"\"\"@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n",
+ " author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n",
+ " title = {Learning Word Vectors for Sentiment Analysis},\n",
+ " booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n",
+ " month = {June},\n",
+ " year = {2011},\n",
+ " address = {Portland, Oregon, USA},\n",
+ " publisher = {Association for Computational Linguistics},\n",
+ " pages = {142--150},\n",
+ " url = {http://www.aclweb.org/anthology/P11-1015}\n",
+ " }\"\"\",\n",
+ " redistribution_info=,\n",
+ ")\n",
+ "\n",
+ "\u001b[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /home/raschka/tensorflow_datasets/imdb_reviews/plain_text/0.1.0...\u001b[0m\n"
]
- }
- ],
- "source": [
- "vocab_set = {'a', 'b', 'c', 'd'}\n",
- "encoder = tfds.features.text.TokenTextEncoder(vocab_set)\n",
- "print(encoder)\n",
- "\n",
- "print(encoder.encode(b'a b c d, , : .'))\n",
- "\n",
- "print(encoder.encode(b'a b c d e f g h i z'))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### C- Text Pre-processing with Keras "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "TOP_K = 200\n",
- "MAX_LEN = 10\n",
- "\n",
- "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
- "\n",
- "tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])\n",
- "sequences = tokenizer.texts_to_sequences(['this is an example', 'je suis en forme '])\n",
- "print(sequences)\n",
- "\n",
- "tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "####\n",
- "TOP_K = 20000\n",
- "MAX_LEN = 500\n",
- "\n",
- "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
- "\n",
- "tokenizer.fit_on_texts(\n",
- " [example['text'].numpy().decode('utf-8') \n",
- " for example in imdb_train])\n",
- "\n",
- "x_train = tokenizer.texts_to_sequences(\n",
- " [example['text'].numpy().decode('utf-8')\n",
- " for example in imdb_train])\n",
- "\n",
- "print(len(x_train))\n",
- "\n",
- "\n",
- "x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(\n",
- " x_train, maxlen=MAX_LEN)\n",
- "\n",
- "print(x_train_padded.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### D- Embedding\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "de79e74b2c204490bac94be76c59f0a5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "538b24048f2a4307b2570bbcd6b4ab00",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]\n",
- " [-0.00420691818 -0.0375086069 -0.00477621704 0.00311584398]\n",
- " [0.028728161 -0.0440448038 -0.0428906195 -0.019158531]\n",
- " [-0.0248817336 0.0408470519 -0.00285203382 -0.0257614851]\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5be14932e00141e296b58813c2aed3e3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "00f49d8f9d6148fea84e304e312c4a3f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Shuffling...', max=10, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From /home/raschka/miniconda3/lib/python3.7/site-packages/tensorflow_datasets/core/file_format_adapter.py:209: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n",
+ "Instructions for updating:\n",
+ "Use eager execution and: \n",
+ "`tf.data.TFRecordDataset(path)`\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:tensorflow:From /home/raschka/miniconda3/lib/python3.7/site-packages/tensorflow_datasets/core/file_format_adapter.py:209: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n",
+ "Instructions for updating:\n",
+ "Use eager execution and: \n",
+ "`tf.data.TFRecordDataset(path)`\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "38d1a86661954cae828ad256d40a92bc",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ccc6d64908494ca69c31c17ed9d3fc8a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9cdb75bbe50c44d7b98ddee86990a056",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f629460148724702a0dc07f39a1036fb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5d2bc744109141e8af87a090b7f0bd79",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f65100cfcff9421891c3aca5f92e6e35",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b0357aa53a274fc6aba4e9383937ce32",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c9b5a917709f4ca58804e20b6e57763e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "aa9565276fec427083574782b7a978b7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7ca461ea11be4af184f432cbb612e5f2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "636e631bbd594aebb0c33111e0ee7d7d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0932df82a331465c8d82cc69dbb0521f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "56b715c451f24fa9bbfe8d7214e89d0e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d2312c56af214db181801c1a35437553",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3d058093762f40beb363ee7c3e282609",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3a9ecefeac964b07bf14b89c21575735",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e641fc32d8cd43a985f4aa8467c3f1c8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fdf16f70917a4b47b7840b4b6186df4d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c2cdeaf540764609a896ec59e38feaaa",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b0ca0e9241b540139ce759425c71890d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "11bd659783ec45e98be7caba5c11ef18",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "cbb812579b0e446dafefdf5eccdc8612",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Shuffling...', max=10, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5b93a9d9b9c549be95ea73d507693ba5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f32dae170cd449a69ec862d20641133a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fab06d6b0b3c469d8f4051bd96e9aa71",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "71d2f0d35e4e495e8eb6eaa45ec28c32",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d03e6d401cfc4d09a784b3fedc9c4df9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9f158c0c614348029a1573a5ee738f49",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "77c83c93259f48cd96bdc3b9f999ac12",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8ec339e2dfa34ae5936d0f2967ce4ed1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7f21b47ffde14506bb446928510dad38",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "941941e809f7481db75aaac0db3611d1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f9f58fc3f4344318b7bf62710b51d924",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c52c15d2303a4d6cb4ab4a64018d69c5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3a8d6944bff74d36bcd2e17639e7f64b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "47763715394b49618453989273c15a38",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c50a3c53780b4e4297a078ada405734a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b175e9e1fc074ccb9df408b1d8c7481a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "45291f47ad2a48f79c707485c2d8994e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a2ca9a9a1a044a30bc234e4b763640d3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a6323fb1eb2847e7b15778840a4df2b1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4402375d75d541ecb87ec1581d0087c0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "885b9473e3d544a3b6b97a2e6cd84210",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c332b6392ce14b63aedf87ab9ba548d9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Shuffling...', max=20, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9f134ac7d9794013acd4de7fd85df3c8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "703bd98b621c4db8b492b1009ba6b8f2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6795e1b6cc794f6f971b2436787a51f6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "bb3d851c2a63462c8856c6c58aa1809e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ebe4b3cc57ce4f87a7095e271c481429",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6e9a9b4e0f01488eaf79dcb62906fd28",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2e89ff3e960f4fae9ada9990cbf53fc1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fe268d74efdb4cee9e6283285b390050",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5f10167829704d9aafb8b4908f8598fd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a030df39d3b847649964e0efda3914e1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0f5f4d2d423c46e4aa5267bc9394e9e2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e2ebfe1b9d2a407081a6159adb873613",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2b37cd9626124d669a42162466e49b9b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "22bf09a24a2d4972b82c2caf15bd15bc",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "eeed7bd1d3074be6baaabc7bad706565",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fc32c7db4618406384f739c816bd9cfd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2810227adcf54e479c0eb3a3b8d13b4f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5252aa813ea04294a5fceec21aeed1ea",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "169e22f658ac41d4809753405ccb3df2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "861198bcff3842f1a35544fd982505de",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8fad9d16feb3446191b7a0d8430b5886",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "915c8888fa5b4a24ba3ec935f2924955",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0f378c95e1d448a3b0fab61759b91d69",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c68480a4635e4b7993649df9f50c173f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1d1cc390ed3d4dfd86de7743a39f1e59",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b19fe83393624220a6dace6c965e2830",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c30986d29dc64281b62f35fce1ffdbfb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9916f0608c454c558fe349c8a308388b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0027ed54de6e4010be6d833d38303ebd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f395c711d04f46fc92a97153f2fd0688",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ea071488d6464996981af2af51c043d3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0dd6e0e5e15647d9acc9d8a7096e5ecb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0832cbbf73a94d5ab281b4cb26eba9d6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "592be5c9161d444d8b81b8be08a879e5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5537626415c94ceea6db81b5043cb861",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3fc229b5336f4b81943586079469dff9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e336e5505bdc48eb91ce493b942f098a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4daf1ba7e9bf4cb4b21948197d8b3e7f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1d89a6af1797401ba8c9b64755bf5de6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4a147ea88311404488d6601531ba5350",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1mDataset imdb_reviews downloaded and prepared to /home/raschka/tensorflow_datasets/imdb_reviews/plain_text/0.1.0. Subsequent calls will reuse this data.\u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['test', 'train', 'unsupervised'])"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "imdb_bldr = tfds.builder('imdb_reviews')\n",
+ "print(imdb_bldr.info)\n",
+ "\n",
+ "imdb_bldr.download_and_prepare()\n",
+ "\n",
+ "datasets = imdb_bldr.as_dataset(shuffle_files=False)\n",
+ "\n",
+ "datasets.keys()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imdb_train = datasets['train']\n",
+ "imdb_test = datasets['test']  # keep the test split separate from imdb_train"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### B -- Tokenizer and Encoder\n",
+ "\n",
+ " * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n",
+ " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "[4, 3, 2, 1]\n",
+ "[4, 3, 2, 1, 5, 5, 5, 5, 5, 5]\n"
+ ]
+ }
+ ],
+ "source": [
+ "vocab_set = {'a', 'b', 'c', 'd'}\n",
+ "encoder = tfds.features.text.TokenTextEncoder(vocab_set)\n",
+ "print(encoder)\n",
+ "\n",
+ "print(encoder.encode(b'a b c d, , : .'))\n",
+ "\n",
+ "print(encoder.encode(b'a b c d e f g h i z'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### C -- Text Pre-processing with Keras "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[1, 2, 3, 4], [5, 6, 7, 8]]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, 0, 0, 0, 1, 2, 3, 4],\n",
+ " [0, 0, 0, 0, 0, 0, 5, 6, 7, 8]], dtype=int32)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "TOP_K = 200\n",
+ "MAX_LEN = 10\n",
+ "\n",
+ "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
+ "\n",
+ "tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])\n",
+ "sequences = tokenizer.texts_to_sequences(['this is an example', 'je suis en forme '])\n",
+ "print(sequences)\n",
+ "\n",
+ "tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "25000\n",
+ "(25000, 500)\n"
+ ]
+ }
+ ],
+ "source": [
+ "TOP_K = 20000\n",
+ "MAX_LEN = 500\n",
+ "\n",
+ "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n",
+ "\n",
+ "tokenizer.fit_on_texts(\n",
+ " [example['text'].numpy().decode('utf-8') \n",
+ " for example in imdb_train])\n",
+ "\n",
+ "x_train = tokenizer.texts_to_sequences(\n",
+ " [example['text'].numpy().decode('utf-8')\n",
+ " for example in imdb_train])\n",
+ "\n",
+ "print(len(x_train))\n",
+ "\n",
+ "\n",
+ "x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(\n",
+ " x_train, maxlen=MAX_LEN)\n",
+ "\n",
+ "print(x_train_padded.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### D -- Embedding\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]\n",
+ " [-0.00420691818 -0.0375086069 -0.00477621704 0.00311584398]\n",
+ " [0.028728161 -0.0440448038 -0.0428906195 -0.019158531]\n",
+ " [-0.0248817336 0.0408470519 -0.00285203382 -0.0257614851]\n",
" [0.0443614833 0.00331580639 0.043055404 -0.011118304]\n",
" [-0.0281324144 0.00720113516 0.0192188732 -0.0186921246]]\n",
"TensorShape([6, 4])\n",
@@ -1395,6 +2790,7 @@
"source": [
"from tensorflow.keras.layers import Embedding\n",
"\n",
+ "\n",
"tf.random.set_seed(1)\n",
"embed = Embedding(input_dim=100, output_dim=4)\n",
"\n",
@@ -1405,6 +2801,16 @@
"tf.print(embed(np.array([1])))"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "<br>\n",
+ "<br>\n",
+ "\n",
+ "---"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1416,7 +2822,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
@@ -1424,20 +2830,13 @@
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook ch16_part1.ipynb to script\n",
- "[NbConvertApp] Writing 16880 bytes to ch16_part1.py\n"
+ "[NbConvertApp] Writing 16576 bytes to ch16_part1.py\n"
]
}
],
"source": [
"! python ../.convert_notebook_to_script.py --input ch16_part1.ipynb --output ch16_part1.py"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -1456,9 +2855,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.7.3"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/ch16/ch16_part1.py b/ch16/ch16_part1.py
index 53dc9056..0a4e6cf8 100644
--- a/ch16/ch16_part1.py
+++ b/ch16/ch16_part1.py
@@ -5,6 +5,9 @@
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
+import os
+import gzip
+import shutil
from collections import Counter
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
@@ -20,9 +23,13 @@
#
# Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt)
-# Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (part 1/2)
-# ========
-#
+# # Chapter 16: Modeling Sequential Data Using Recurrent Neural Networks (Part 1/2)
+
+# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
+
+
+
+
# # Introducing sequential data
#
@@ -52,9 +59,9 @@
w_xh, w_oo, b_h = rnn_layer.weights
-print('W_xh shape: ', w_xh.shape)
-print('W_oo shape: ', w_oo.shape)
-print('b_h shape: ', b_h.shape)
+print('W_xh shape:', w_xh.shape)
+print('W_oo shape:', w_oo.shape)
+print('b_h shape:', b_h.shape)
@@ -72,10 +79,10 @@
for t in range(len(x_seq)):
xt = tf.reshape(x_seq[t], (1, 5))
print('Time step {} =>'.format(t))
- print(' Input : ', xt.numpy())
+ print(' Input :', xt.numpy())
ht = tf.matmul(xt, w_xh) + b_h
- print(' Hidden : ', ht.numpy())
+ print(' Hidden :', ht.numpy())
if t>0:
prev_o = out_man[t-1]
@@ -85,8 +92,8 @@
ot = ht + tf.matmul(prev_o, w_oo)
ot = tf.math.tanh(ot)
out_man.append(ot)
- print(' Output (manual) : ', ot.numpy())
- print(' SimpleRNN output: '.format(t), output[0][t].numpy())
+ print(' Output (manual) :', ot.numpy())
+ print(' SimpleRNN output:', output[0][t].numpy())
print()
@@ -108,6 +115,14 @@
+
+
+with gzip.open('../ch08/movie_data.csv.gz', 'rb') as f_in, open('movie_data.csv', 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+
+
+
+
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.tail()
@@ -160,7 +175,7 @@
tokens = tokenizer.tokenize(example[0].numpy()[0])
token_counts.update(tokens)
-print('Vocab-size: ', len(token_counts))
+print('Vocab-size:', len(token_counts))
@@ -175,8 +190,8 @@
-
## Step 3-A: define the function for transformation
+
def encode(text_tensor, label):
text = text_tensor.numpy()[0]
encoded_text = encoder.encode(text)
@@ -196,54 +211,41 @@ def encode_map_fn(text, label):
tf.random.set_seed(1)
for example in ds_train.shuffle(1000).take(5):
- print('Sequence length: ', example[0].shape)
+ print('Sequence length:', example[0].shape)
example
# * **batch() vs. padded_batch()**
-
-
-# ###### ### ### # ###
-# # # # # # # # # #
-# # # # # # # # # #
-# ##### ##### ##### # # #####
-# # # # # # # # # #
-# # # # # # # # # #
-# ###### # # # # # #
-#
-# this will result in error
-BATCH_SIZE = 32
-train_data = all_encoded_data.batch(BATCH_SIZE)
-
-next(iter(train_data))
-
-# Running this will result in error
-# We cannot apply .batch() to this dataset
-
-# ###### ### ### # ###
-# # # # # # # # # #
-# # # # # # # # # #
-# ##### ##### ##### # # #####
-# # # # # # # # # #
-# # # # # # # # # #
-# ###### # # # # # #
-
+# ```python
+#
+# # this will result in error
+#
+#
+# BATCH_SIZE = 32
+# train_data = all_encoded_data.batch(BATCH_SIZE)
+#
+# next(iter(train_data))
+#
+# # Running this will result in error
+# # We cannot apply .batch() to this dataset
+# ```
## Take a small subset
+
ds_subset = ds_train.take(8)
for example in ds_subset:
- print('Individual Shape: ', example[0].shape)
+ print('Individual Shape:', example[0].shape)
## batching the datasets
ds_batched = ds_subset.padded_batch(
4, padded_shapes=([-1], []))
for batch in ds_batched:
- print('Batch Shape: ', batch[0].shape)
+ print('Batch Shape:', batch[0].shape)
@@ -278,6 +280,7 @@ def encode_map_fn(text, label):
+
model = tf.keras.Sequential()
model.add(Embedding(input_dim=100,
@@ -309,6 +312,7 @@ def encode_map_fn(text, label):
## An example of building a RNN model
## with SimpleRNN layer
+
model = Sequential()
model.add(Embedding(1000, 32))
model.add(SimpleRNN(32, return_sequences=True))
@@ -322,6 +326,9 @@ def encode_map_fn(text, label):
## An example of building a RNN model
## with LSTM layer
+
+
+
model = Sequential()
model.add(Embedding(10000, 32))
model.add(LSTM(32, return_sequences=True))
@@ -347,7 +354,6 @@ def encode_map_fn(text, label):
-
embedding_dim = 20
vocab_size = len(token_counts) + 2
@@ -389,6 +395,10 @@ def encode_map_fn(text, label):
+if not os.path.exists('models'):
+ os.mkdir('models')
+
+
bi_lstm_model.save('models/Bidir-LSTM-full-length-seq.h5')
@@ -396,7 +406,6 @@ def encode_map_fn(text, label):
-
def preprocess_datasets(
ds_raw_train,
ds_raw_valid,
@@ -415,7 +424,7 @@ def preprocess_datasets(
tokens = tokens[-max_seq_length:]
token_counts.update(tokens)
- print('Vocab-size: ', len(token_counts))
+ print('Vocab-size:', len(token_counts))
## Step 3: encoding the texts
@@ -451,8 +460,6 @@ def encode_map_fn(text, label):
-
-
def build_rnn_model(embedding_dim, vocab_size,
recurrent_type='SimpleRNN',
n_recurrent_units=64,
@@ -505,6 +512,7 @@ def build_rnn_model(embedding_dim, vocab_size,
+
batch_size = 32
embedding_dim = 20
max_seq_length = 100
@@ -543,7 +551,7 @@ def build_rnn_model(embedding_dim, vocab_size,
-rnn_model.evaluate(test_data)
+results = rnn_model.evaluate(test_data)
@@ -557,7 +565,6 @@ def build_rnn_model(embedding_dim, vocab_size,
-
batch_size = 32
embedding_dim = 20
max_seq_length = None
@@ -593,18 +600,10 @@ def build_rnn_model(embedding_dim, vocab_size,
epochs=10)
-
-
-
-
-
-#
-
# # Appendix
#
-# ### A- An alterntaive way to get the dataset: using tensorflow_datasets
-
+# ### A -- An alternative way to get the dataset: using tensorflow_datasets
@@ -618,7 +617,13 @@ def build_rnn_model(embedding_dim, vocab_size,
datasets.keys()
-# ### B- Tokenizer and Encoder
+
+
+imdb_train = datasets['train']
+imdb_test = datasets['test']  # keep the test split separate from imdb_train
+
+
+# ### B -- Tokenizer and Encoder
#
# * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer
# * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder
@@ -636,7 +641,7 @@ def build_rnn_model(embedding_dim, vocab_size,
print(encoder.encode(b'a b c d e f g h i z'))
-# ### C- Text Pre-processing with Keras
+# ### C -- Text Pre-processing with Keras
@@ -654,7 +659,6 @@ def build_rnn_model(embedding_dim, vocab_size,
-####
TOP_K = 20000
MAX_LEN = 500
@@ -677,13 +681,14 @@ def build_rnn_model(embedding_dim, vocab_size,
print(x_train_padded.shape)
-# ### D- Embedding
+# ### D -- Embedding
#
#
+
tf.random.set_seed(1)
embed = Embedding(input_dim=100, output_dim=4)
@@ -694,6 +699,13 @@ def build_rnn_model(embedding_dim, vocab_size,
tf.print(embed(np.array([1])))
+#
+# ---
+
+#
+#
+# Readers may ignore the next cell.
+#
diff --git a/ch16/ch16_part2.ipynb b/ch16/ch16_part2.ipynb
index 1b110d96..e8416852 100644
--- a/ch16/ch16_part2.ipynb
+++ b/ch16/ch16_part2.ipynb
@@ -20,6 +20,38 @@
"\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sebastian Raschka & Vahid Mirjalili \n",
+ "last updated: 2019-11-03 \n",
+ "\n",
+ "numpy 1.17.2\n",
+ "scipy 1.2.1\n",
+ "matplotlib 3.1.0\n",
+ "tensorflow 2.0.0\n",
+ "tensorflow_datasets 1.3.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext watermark\n",
+ "%watermark -a \"Sebastian Raschka & Vahid Mirjalili\" -u -d -p numpy,scipy,matplotlib,tensorflow,tensorflow_datasets"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -31,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -40,7 +72,7 @@
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
- "100 1144k 100 1144k 0 0 2706k 0 --:--:-- --:--:-- --:--:-- 2711k\n"
+ "100 1144k 100 1144k 0 0 3034k 0 --:--:-- --:--:-- --:--:-- 3034k\n"
]
}
],
@@ -58,14 +90,15 @@
"output_type": "stream",
"text": [
"567 1112917\n",
- "Total Length: 1112350\n",
- "Unique Characters: 80\n"
+ "Total Length: 1112350\n",
+ "Unique Characters: 80\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
+ "\n",
"## Reading and processing text\n",
"with open('1268-0.txt', 'r') as fp:\n",
" text=fp.read()\n",
@@ -76,8 +109,8 @@
"\n",
"text = text[start_indx:end_indx]\n",
"char_set = set(text)\n",
- "print('Total Length: ', len(text))\n",
- "print('Unique Characters: ', len(char_set))\n"
+ "print('Total Length:', len(text))\n",
+ "print('Unique Characters:', len(char_set))"
]
},
{
@@ -90,8 +123,8 @@
"output_type": "stream",
"text": [
"Text encoded shape: (1112350,)\n",
- "THE MYSTERIOUS == Encoding ==> [44 32 29 1 37 48 43 44 29 42 33 39 45 43 1]\n",
- "[33 43 36 25 38 28] == Reverse ==> ISLAND\n"
+ "THE MYSTERIOUS == Encoding ==> [44 32 29 1 37 48 43 44 29 42 33 39 45 43 1]\n",
+ "[33 43 36 25 38 28] == Reverse ==> ISLAND\n"
]
}
],
@@ -106,8 +139,8 @@
"\n",
"print('Text encoded shape: ', text_encoded.shape)\n",
"\n",
- "print(text[:15], ' == Encoding ==> ', text_encoded[:15])\n",
- "print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))"
+ "print(text[:15], ' == Encoding ==> ', text_encoded[:15])\n",
+ "print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))"
]
},
{
@@ -130,6 +163,7 @@
"source": [
"import tensorflow as tf\n",
"\n",
+ "\n",
"ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)\n",
"\n",
"for ex in ds_text_encoded.take(5):\n",
@@ -163,7 +197,7 @@
" target = seq[seq_length].numpy()\n",
" print(input_seq, ' -> ', target)\n",
" print(repr(''.join(char_array[input_seq])), \n",
- " ' -> ', repr(''.join(char_array[target])))\n"
+ " ' -> ', repr(''.join(char_array[target])))"
]
},
{
@@ -175,11 +209,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
- " Input (x): 'THE MYSTERIOUS ISLAND ***\\n\\n\\n\\n\\nProduced b'\n",
- "Target (y): 'HE MYSTERIOUS ISLAND ***\\n\\n\\n\\n\\nProduced by'\n",
+ " Input (x): 'THE MYSTERIOUS ISLAND ***\\n\\n\\n\\n\\nProduced b'\n",
+ "Target (y): 'HE MYSTERIOUS ISLAND ***\\n\\n\\n\\n\\nProduced by'\n",
"\n",
- " Input (x): ' Anthony Matonak, and Trevor Carlson\\n\\n\\n\\n'\n",
- "Target (y): 'Anthony Matonak, and Trevor Carlson\\n\\n\\n\\n\\n'\n",
+ " Input (x): ' Anthony Matonak, and Trevor Carlson\\n\\n\\n\\n'\n",
+ "Target (y): 'Anthony Matonak, and Trevor Carlson\\n\\n\\n\\n\\n'\n",
"\n"
]
}
@@ -195,8 +229,8 @@
"\n",
"## inspection:\n",
"for example in ds_sequences.take(2):\n",
- " print(' Input (x): ', repr(''.join(char_array[example[0].numpy()])))\n",
- " print('Target (y): ', repr(''.join(char_array[example[1].numpy()])))\n",
+ " print(' Input (x):', repr(''.join(char_array[example[0].numpy()])))\n",
+ " print('Target (y):', repr(''.join(char_array[example[1].numpy()])))\n",
" print()"
]
},
@@ -236,22 +270,22 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Model: \"sequential_1\"\n",
+ "Model: \"sequential\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
- "embedding_1 (Embedding) (None, None, 256) 20480 \n",
+ "embedding (Embedding) (None, None, 256) 20480 \n",
"_________________________________________________________________\n",
- "lstm_1 (LSTM) (None, None, 512) 1574912 \n",
+ "lstm (LSTM) (None, None, 512) 1574912 \n",
"_________________________________________________________________\n",
- "dense_1 (Dense) (None, None, 80) 41040 \n",
+ "dense (Dense) (None, None, 80) 41040 \n",
"=================================================================\n",
"Total params: 1,636,432\n",
"Trainable params: 1,636,432\n",
@@ -287,7 +321,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -295,42 +329,56 @@
"output_type": "stream",
"text": [
"Epoch 1/20\n",
- "424/424 [==============================] - 94s 222ms/step - loss: 2.3170\n",
+ "424/424 [==============================] - 69s 162ms/step - loss: 2.3011\n",
"Epoch 2/20\n",
- "424/424 [==============================] - 92s 218ms/step - loss: 1.7389\n",
+ "424/424 [==============================] - 66s 157ms/step - loss: 1.7332\n",
"Epoch 3/20\n",
- "424/424 [==============================] - 93s 219ms/step - loss: 1.5360\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.5343\n",
"Epoch 4/20\n",
- "424/424 [==============================] - 93s 219ms/step - loss: 1.4221\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.4204\n",
"Epoch 5/20\n",
- "424/424 [==============================] - 92s 217ms/step - loss: 1.3495\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.3477\n",
"Epoch 6/20\n",
- "424/424 [==============================] - 92s 217ms/step - loss: 1.2981\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.2976\n",
"Epoch 7/20\n",
- "424/424 [==============================] - 93s 218ms/step - loss: 1.2605\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.2597\n",
"Epoch 8/20\n",
- "424/424 [==============================] - 93s 218ms/step - loss: 1.2305\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.2286\n",
"Epoch 9/20\n",
- "424/424 [==============================] - 93s 219ms/step - loss: 1.2053\n",
+ "424/424 [==============================] - 66s 157ms/step - loss: 1.2030\n",
"Epoch 10/20\n",
- "424/424 [==============================] - 93s 218ms/step - loss: 1.1832\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.1817\n",
"Epoch 11/20\n",
- "424/424 [==============================] - 92s 217ms/step - loss: 1.1634\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.1618\n",
"Epoch 12/20\n",
- "424/424 [==============================] - 92s 217ms/step - loss: 1.1457\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.1446\n",
"Epoch 13/20\n",
- "424/424 [==============================] - 92s 218ms/step - loss: 1.1296\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.1282\n",
"Epoch 14/20\n",
- "424/424 [==============================] - 92s 218ms/step - loss: 1.1142\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.1125\n",
"Epoch 15/20\n",
- "424/424 [==============================] - 93s 219ms/step - loss: 1.1000\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0984\n",
"Epoch 16/20\n",
- "424/424 [==============================] - 92s 216ms/step - loss: 1.0858\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0844\n",
"Epoch 17/20\n",
- "424/424 [==============================] - 93s 219ms/step - loss: 1.0721\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0716\n",
"Epoch 18/20\n",
- "159/424 [==========>...................] - ETA: 59s - loss: 1.0615"
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0588\n",
+ "Epoch 19/20\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0458\n",
+ "Epoch 20/20\n",
+ "424/424 [==============================] - 67s 157ms/step - loss: 1.0335\n"
]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
@@ -359,7 +407,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Probabilities: [0.33333334 0.33333334 0.33333334]\n",
+ "Probabilities: [0.33333334 0.33333334 0.33333334]\n",
"array([[0, 0, 1, 2, 0, 0, 0, 0, 1, 0]])\n"
]
}
@@ -368,7 +416,7 @@
"tf.random.set_seed(1)\n",
"\n",
"logits = [[1.0, 1.0, 1.0]]\n",
- "print('Probabilities: ', tf.math.softmax(logits).numpy()[0])\n",
+ "print('Probabilities:', tf.math.softmax(logits).numpy()[0])\n",
"\n",
"samples = tf.random.categorical(\n",
" logits=logits, num_samples=10)\n",
@@ -384,7 +432,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Probabilities: [0.10650698 0.10650698 0.78698605]\n",
+ "Probabilities: [0.10650698 0.10650698 0.78698605]\n",
"array([[2, 0, 2, 2, 2, 0, 1, 2, 2, 0]])\n"
]
}
@@ -393,7 +441,7 @@
"tf.random.set_seed(1)\n",
"\n",
"logits = [[1.0, 1.0, 3.0]]\n",
- "print('Probabilities: ', tf.math.softmax(logits).numpy()[0])\n",
+ "print('Probabilities:', tf.math.softmax(logits).numpy()[0])\n",
"\n",
"samples = tf.random.categorical(\n",
" logits=logits, num_samples=10)\n",
@@ -409,22 +457,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "The island is open he heard the victory of the\n",
- "Mercy, and brought it into them, and they no longer continue, some on the little man of the felting circle of slopes.\n",
- "\n",
- "The engineer troused, he could not find our companions.\n",
- "\n",
- "\n",
- "\n",
- "Chapter 11\n",
- "\n",
- "At this position, he might just as if his first true to be finished, and he\n",
- "though not more I can this teles.”\n",
- "\n",
- "“Why shall fear line,” answered the reporter, “what a disposal silence was advanced with them, and in masterspon.\n",
- "\n",
- "Before three heights of the\n",
- "Frenchant Heights \n"
+ "The island may have been holled up the material recase from his mind now, they had no contact the firearms aroused them, but the first of the sun had a large must sean in a few minutes great best ton. Neb len through the edder the unfortunate, they had remained at the corral, the pointes of fall into the high genior narrow easied at their able themsevere\n",
+ "wished to protect the ordrain of the roup blad. On the land of their still more mounted by inflammunity cliff ordered the missol. There was on the bows, \n"
]
}
],
@@ -460,7 +494,7 @@
" return generated_str\n",
"\n",
"tf.random.set_seed(1)\n",
- "print(sample(model, starting_str=\"The island\", \n",
+ "print(sample(model, starting_str='The island', \n",
" scale_factor=1.0))"
]
},
@@ -493,68 +527,61 @@
"\n",
"print('Probabilities after scaling with 0.5:', tf.math.softmax(0.5*logits).numpy()[0])\n",
"\n",
- "print('Probabilities after scaling with 0.1:', tf.math.softmax(0.1*logits).numpy()[0])\n"
+ "print('Probabilities after scaling with 0.1:', tf.math.softmax(0.1*logits).numpy()[0])"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "The island spoke of heavy torn into the island from the sea.\n",
- "\n",
- "The noise of the inhabitants of the island was to be feared that the colonists had come a project with a straight be put to the bank of the island was the surface of the lake and sulphuric acid, and several supply of her animals. The first stranger carried a sort of accessible to break these screen barrels to their distance from the palisade.\n",
- "\n",
- "“The first huntil,” said the reporter, “and his companions the reporter extended to build a few days a\n"
+ "The island was so as to contracted the plateau of Prospect Heights was to be understood. The settlers were brought to the southern coast of the island and stood out of the colonists, who was also met with the palisade. The boat was not a sort of his mise, and they were in the midst of the point of the sea, and he should have been able to save him. “With a ready to the settlers were obliged to that he was the last side of the sea, the strength of the drawn and fell on the shore. The trees were fired at the\n"
]
}
],
"source": [
"tf.random.set_seed(1)\n",
- "print(sample(model, starting_str=\"The island\", \n",
+ "print(sample(model, starting_str='The island', \n",
" scale_factor=2.0))"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "The island\n",
- "glissed in\n",
- "ascercicedly useful? loigeh, Cyrus,\n",
- "Spileots,” henseporvemented\n",
- "House to a left\n",
- "the centlic moment. Tonsense craw.\n",
- "\n",
- "Pencrular ed/ of times,” tading had coflently often above anzand?”\n",
- "\n",
- "“Wat;” then:y.”\n",
- "\n",
+ "The island harned all\n",
+ "execumed, whether existed\n",
+ "in Freh oad, or\n",
+ "notp? atlisheve,\n",
+ "know Ivan arrih!. Dudgemed it; belongly, still likelowed on ewt.\n",
"\n",
- "Ardivify he acpearly, howcovered--he hassime; however, fenquests hen adgents!’.? Let us Neg eqiAl?.\n",
+ "An Aconnoun unvow Clave,\n",
+ "Ogen of.” criefly Harding,” observed\n",
+ "eating from Ta cimitlat. I.\n",
"\n",
- "GencNal, my surved thirtyin” ou; is Harding; treuths. Osew apartarned. “N,\n",
- "the poltuge of about-but durired with purteg.\n",
+ "These urislarsnifigent gaveAned it, there could ruffil!\n",
"\n",
- "Chappes wason!\n",
- "\n",
- "Fears,” returned Spilett; “if\n",
- "you tear 8t trung\n"
+ "Memisproy?--its east, ma, in\n",
+ "hearia! Austhours Oclas!” to” re!” ald he smumps without it did not plenwe Prescries? Certainly this\n",
+ "horns having putting\n",
+ "him meat.”\n",
+ "By damb Capteinggoine his plants-abavanoo;\n",
+ "near this \n"
]
}
],
"source": [
"tf.random.set_seed(1)\n",
- "print(sample(model, starting_str=\"The island\", \n",
+ "print(sample(model, starting_str='The island', \n",
" scale_factor=0.5))"
]
},
@@ -597,7 +624,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -605,20 +632,13 @@
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook ch16_part2.ipynb to script\n",
- "[NbConvertApp] Writing 5753 bytes to ch16_part2.py\n"
+ "[NbConvertApp] Writing 6472 bytes to ch16_part2.py\n"
]
}
],
"source": [
"! python ../.convert_notebook_to_script.py --input ch16_part2.ipynb --output ch16_part2.py"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -637,9 +657,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.7.3"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/ch16/ch16_part2.py b/ch16/ch16_part2.py
index 03bcd774..59bdf0f6 100644
--- a/ch16/ch16_part2.py
+++ b/ch16/ch16_part2.py
@@ -15,6 +15,12 @@
#
#
+# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
+
+
+
+
+
# ## Project two: character-level language modeling in TensorFlow
#
# ### Preprocessing the dataset
@@ -26,6 +32,7 @@
+
## Reading and processing text
with open('1268-0.txt', 'r') as fp:
text=fp.read()
@@ -36,8 +43,8 @@
text = text[start_indx:end_indx]
char_set = set(text)
-print('Total Length: ', len(text))
-print('Unique Characters: ', len(char_set))
+print('Total Length:', len(text))
+print('Unique Characters:', len(char_set))
@@ -52,8 +59,9 @@
print('Text encoded shape: ', text_encoded.shape)
-print(text[:15], ' == Encoding ==> ', text_encoded[:15])
-print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))
+print(text[:15], ' == Encoding ==> ', text_encoded[:15])
+print(text_encoded[15:21], ' == Reverse ==> ', ''.join(char_array[text_encoded[15:21]]))
+
@@ -93,8 +101,8 @@ def split_input_target(chunk):
## inspection:
for example in ds_sequences.take(2):
- print(' Input (x): ', repr(''.join(char_array[example[0].numpy()])))
- print('Target (y): ', repr(''.join(char_array[example[1].numpy()])))
+ print(' Input (x):', repr(''.join(char_array[example[0].numpy()])))
+ print('Target (y):', repr(''.join(char_array[example[1].numpy()])))
print()
@@ -156,7 +164,7 @@ def build_model(vocab_size, embedding_dim, rnn_units):
tf.random.set_seed(1)
logits = [[1.0, 1.0, 1.0]]
-print('Probabilities: ', tf.math.softmax(logits).numpy()[0])
+print('Probabilities:', tf.math.softmax(logits).numpy()[0])
samples = tf.random.categorical(
logits=logits, num_samples=10)
@@ -168,7 +176,7 @@ def build_model(vocab_size, embedding_dim, rnn_units):
tf.random.set_seed(1)
logits = [[1.0, 1.0, 3.0]]
-print('Probabilities: ', tf.math.softmax(logits).numpy()[0])
+print('Probabilities:', tf.math.softmax(logits).numpy()[0])
samples = tf.random.categorical(
logits=logits, num_samples=10)
@@ -208,7 +216,7 @@ def sample(model, starting_str,
return generated_str
tf.random.set_seed(1)
-print(sample(model, starting_str="The island",
+print(sample(model, starting_str='The island',
scale_factor=1.0))
@@ -228,14 +236,14 @@ def sample(model, starting_str,
tf.random.set_seed(1)
-print(sample(model, starting_str="The island",
+print(sample(model, starting_str='The island',
scale_factor=2.0))
tf.random.set_seed(1)
-print(sample(model, starting_str="The island",
+print(sample(model, starting_str='The island',
scale_factor=0.5))
@@ -266,8 +274,3 @@ def sample(model, starting_str,
-
-
-
-
-