diff --git a/ch16/ch16_part1.ipynb b/ch16/ch16_part1.ipynb index 82c1be9a..d014a277 100644 --- a/ch16/ch16_part1.ipynb +++ b/ch16/ch16_part1.ipynb @@ -37,11 +37,11 @@ "Sebastian Raschka & Vahid Mirjalili \n", "last updated: 2019-11-03 \n", "\n", - "numpy 1.17.2\n", - "scipy 1.2.1\n", - "matplotlib 3.1.0\n", + "numpy 1.17.3\n", + "scipy 1.3.1\n", + "matplotlib 3.1.1\n", "tensorflow 2.0.0\n", - "tensorflow_datasets 1.3.0\n" + "tensorflow_datasets 1.2.0\n" ] } ], @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -84,7 +84,7 @@ "" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": { "image/png": { "width": 700 @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -116,7 +116,7 @@ "" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": { "image/png": { "width": 700 @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -150,7 +150,7 @@ "" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": { "image/png": { "width": 700 @@ -165,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -175,7 +175,7 @@ "" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": { "image/png": { "width": 700 @@ -197,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -207,7 +207,7 @@ "" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": { "image/png": { "width": 700 @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -232,7 +232,7 @@ "" ] }, - "execution_count": 10, + 
"execution_count": 8, "metadata": { "image/png": { "width": 700 @@ -254,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -264,7 +264,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": { "image/png": { "width": 700 @@ -279,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -310,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -379,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -389,7 +389,7 @@ "" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": { "image/png": { "width": 700 @@ -412,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -422,7 +422,7 @@ "" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": { "image/png": { "width": 700 @@ -449,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -461,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -506,27 +506,27 @@ " \n", " \n", " \n", - " 49995\n", + " 49995\n", " OK, lets start with the best. the building. al...\n", " 0\n", " \n", " \n", - " 49996\n", + " 49996\n", " The British 'heritage film' industry is out of...\n", " 0\n", " \n", " \n", - " 49997\n", + " 49997\n", " I don't even know where to begin on this one. ...\n", " 0\n", " \n", " \n", - " 49998\n", + " 49998\n", " Richard Tyler is a little boy who is scared of...\n", " 0\n", " \n", " \n", - " 49999\n", + " 49999\n", " I waited long to watch this movie. 
Also becaus...\n", " 1\n", " \n", @@ -543,7 +543,7 @@ "49999 I waited long to watch this movie. Also becaus... 1" ] }, - "execution_count": 6, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -556,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -624,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -652,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -661,7 +661,7 @@ "[232, 9, 270, 1123]" ] }, - "execution_count": 10, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -677,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -696,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -732,7 +732,7 @@ " )" ] }, - "execution_count": 12, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -777,7 +777,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -814,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -852,7 +852,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -862,7 +862,7 @@ "" ] }, - "execution_count": 15, + "execution_count": 25, "metadata": { "image/png": { "width": 700 @@ -877,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -934,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 16, + 
"execution_count": 27, "metadata": {}, "outputs": [ { @@ -979,7 +979,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1023,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1071,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1095,27 +1095,27 @@ "Non-trainable params: 0\n", "_________________________________________________________________\n", "Epoch 1/10\n", - "625/625 [==============================] - 268s 429ms/step - loss: 0.5440 - accuracy: 0.7143 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", + "625/625 [==============================] - 383s 613ms/step - loss: 0.5173 - accuracy: 0.7376 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", "Epoch 2/10\n", - "625/625 [==============================] - 262s 419ms/step - loss: 0.2753 - accuracy: 0.8903 - val_loss: 0.3489 - val_accuracy: 0.8634\n", + "625/625 [==============================] - 369s 590ms/step - loss: 0.2747 - accuracy: 0.8955 - val_loss: 0.3982 - val_accuracy: 0.8516\n", "Epoch 3/10\n", - "625/625 [==============================] - 262s 420ms/step - loss: 0.1378 - accuracy: 0.9514 - val_loss: 0.4153 - val_accuracy: 0.8434\n", + "625/625 [==============================] - 369s 590ms/step - loss: 0.1528 - accuracy: 0.9478 - val_loss: 0.4270 - val_accuracy: 0.8588\n", "Epoch 4/10\n", - "625/625 [==============================] - 263s 420ms/step - loss: 0.0685 - accuracy: 0.9778 - val_loss: 0.4895 - val_accuracy: 0.8450\n", + "625/625 [==============================] - 369s 590ms/step - loss: 0.0958 - accuracy: 0.9674 - val_loss: 0.6082 - val_accuracy: 0.8358\n", "Epoch 5/10\n", - "625/625 [==============================] - 263s 420ms/step - loss: 0.0470 - accuracy: 0.9861 - val_loss: 0.5602 - val_accuracy: 0.8228\n", + "625/625 [==============================] - 368s 589ms/step 
- loss: 0.0551 - accuracy: 0.9836 - val_loss: 0.6766 - val_accuracy: 0.8468\n", "Epoch 6/10\n", - "625/625 [==============================] - 263s 420ms/step - loss: 0.0383 - accuracy: 0.9884 - val_loss: 0.6552 - val_accuracy: 0.8200\n", + "625/625 [==============================] - 368s 590ms/step - loss: 0.0454 - accuracy: 0.9858 - val_loss: 0.6429 - val_accuracy: 0.8398\n", "Epoch 7/10\n", - "625/625 [==============================] - 262s 420ms/step - loss: 0.0294 - accuracy: 0.9916 - val_loss: 0.6579 - val_accuracy: 0.8380\n", + "625/625 [==============================] - 369s 591ms/step - loss: 0.0418 - accuracy: 0.9870 - val_loss: 0.6093 - val_accuracy: 0.8388\n", "Epoch 8/10\n", - "625/625 [==============================] - 262s 420ms/step - loss: 0.0321 - accuracy: 0.9894 - val_loss: 0.8191 - val_accuracy: 0.8182\n", + "625/625 [==============================] - 369s 590ms/step - loss: 0.0288 - accuracy: 0.9908 - val_loss: 0.7021 - val_accuracy: 0.8356\n", "Epoch 9/10\n", - "625/625 [==============================] - 263s 420ms/step - loss: 0.0552 - accuracy: 0.9809 - val_loss: 0.7586 - val_accuracy: 0.8496\n", + "625/625 [==============================] - 368s 590ms/step - loss: 0.0308 - accuracy: 0.9909 - val_loss: 0.7601 - val_accuracy: 0.8392\n", "Epoch 10/10\n", - "625/625 [==============================] - 262s 420ms/step - loss: 0.0252 - accuracy: 0.9921 - val_loss: 0.7784 - val_accuracy: 0.8474\n", - "782/782 [==============================] - 100s 128ms/step - loss: 0.7809 - accuracy: 0.8437\n", - "Test Acc.: 84.37%\n" + "625/625 [==============================] - 369s 590ms/step - loss: 0.0205 - accuracy: 0.9931 - val_loss: 0.8158 - val_accuracy: 0.8388\n", + "782/782 [==============================] - 144s 185ms/step - loss: 0.8131 - accuracy: 0.8392\n", + "Test Acc.: 83.92%\n" ] } ], @@ -1161,7 +1161,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -1181,7 +1181,7 @@ 
}, { "cell_type": "code", - "execution_count": 21, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1239,7 +1239,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -1294,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1302,7 +1302,7 @@ "output_type": "stream", "text": [ "Vocab-size: 58063\n", - "Model: \"sequential_6\"\n", + "Model: \"sequential_5\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", @@ -1350,7 +1350,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1358,25 +1358,25 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "625/625 [==============================] - 54s 87ms/step - loss: 0.6997 - accuracy: 0.5013 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", + "625/625 [==============================] - 68s 109ms/step - loss: 0.7055 - accuracy: 0.5026 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", "Epoch 2/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.6741 - accuracy: 0.5641 - val_loss: 0.7000 - val_accuracy: 0.6032\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.7037 - accuracy: 0.5113 - val_loss: 0.6953 - val_accuracy: 0.5072\n", "Epoch 3/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.4976 - accuracy: 0.7637 - val_loss: 0.5935 - val_accuracy: 0.6918\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.6954 - accuracy: 0.5213 - val_loss: 0.6884 - val_accuracy: 0.5452\n", "Epoch 4/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.3362 - accuracy: 0.8600 - val_loss: 0.4991 - val_accuracy: 0.7818\n", + "625/625 
[==============================] - 67s 106ms/step - loss: 0.6317 - accuracy: 0.6259 - val_loss: 0.6152 - val_accuracy: 0.6522\n", "Epoch 5/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.2168 - accuracy: 0.9157 - val_loss: 0.4592 - val_accuracy: 0.8118\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.4913 - accuracy: 0.7629 - val_loss: 0.7798 - val_accuracy: 0.6278\n", "Epoch 6/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.1549 - accuracy: 0.9422 - val_loss: 0.6382 - val_accuracy: 0.7686\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.4252 - accuracy: 0.8094 - val_loss: 0.6105 - val_accuracy: 0.7432\n", "Epoch 7/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.0899 - accuracy: 0.9696 - val_loss: 0.6741 - val_accuracy: 0.7848\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.4100 - accuracy: 0.8122 - val_loss: 0.6718 - val_accuracy: 0.7016\n", "Epoch 8/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.0659 - accuracy: 0.9777 - val_loss: 0.6549 - val_accuracy: 0.8022\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.3749 - accuracy: 0.8281 - val_loss: 0.7762 - val_accuracy: 0.6348\n", "Epoch 9/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.0710 - accuracy: 0.9758 - val_loss: 0.6970 - val_accuracy: 0.7830\n", + "625/625 [==============================] - 66s 106ms/step - loss: 0.2711 - accuracy: 0.8813 - val_loss: 0.7503 - val_accuracy: 0.6892\n", "Epoch 10/10\n", - "625/625 [==============================] - 52s 83ms/step - loss: 0.0476 - accuracy: 0.9839 - val_loss: 0.7840 - val_accuracy: 0.8038\n" + "625/625 [==============================] - 66s 106ms/step - loss: 0.1665 - accuracy: 0.9367 - val_loss: 0.7172 - val_accuracy: 0.7454\n" ] } ], @@ -1394,14 +1394,14 @@ }, { "cell_type": "code", - "execution_count": 28, + 
"execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "782/782 [==============================] - 28s 36ms/step - loss: 0.7896 - accuracy: 0.8070\n" + "782/782 [==============================] - 39s 50ms/step - loss: 0.7098 - accuracy: 0.7478\n" ] } ], @@ -1411,14 +1411,14 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Test Acc.: 80.70%\n" + "Test Acc.: 74.78%\n" ] } ], @@ -1437,7 +1437,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1445,7 +1445,7 @@ "output_type": "stream", "text": [ "Vocab-size: 87007\n", - "Model: \"sequential_7\"\n", + "Model: \"sequential_6\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1498,25 +1498,25 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "625/625 [==============================] - 178s 285ms/step - loss: 0.6991 - accuracy: 0.5019 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", + "625/625 [==============================] - 225s 361ms/step - loss: 0.6993 - accuracy: 0.5034 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n", "Epoch 2/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6983 - accuracy: 0.4920 - val_loss: 0.6931 - val_accuracy: 0.5108\n", + "625/625 [==============================] - 225s 361ms/step - loss: 0.6993 - accuracy: 0.5006 - val_loss: 0.6998 - val_accuracy: 0.5010\n", "Epoch 3/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6949 - accuracy: 0.4975 - val_loss: 0.6930 - val_accuracy: 0.5128\n", + "625/625 
[==============================] - 226s 361ms/step - loss: 0.6968 - accuracy: 0.5034 - val_loss: 0.6960 - val_accuracy: 0.5004\n", "Epoch 4/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6945 - accuracy: 0.5002 - val_loss: 0.6929 - val_accuracy: 0.5126\n", + "625/625 [==============================] - 225s 360ms/step - loss: 0.6958 - accuracy: 0.5038 - val_loss: 0.6953 - val_accuracy: 0.5002\n", "Epoch 5/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6944 - accuracy: 0.5058 - val_loss: 0.6936 - val_accuracy: 0.5056\n", + "625/625 [==============================] - 226s 361ms/step - loss: 0.6952 - accuracy: 0.5045 - val_loss: 0.6947 - val_accuracy: 0.4952\n", "Epoch 6/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6959 - accuracy: 0.5045 - val_loss: 0.6932 - val_accuracy: 0.5118\n", + "625/625 [==============================] - 226s 361ms/step - loss: 0.6945 - accuracy: 0.5080 - val_loss: 0.6945 - val_accuracy: 0.4956\n", "Epoch 7/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6943 - accuracy: 0.5029 - val_loss: 0.6932 - val_accuracy: 0.5124\n", + "625/625 [==============================] - 225s 360ms/step - loss: 0.6940 - accuracy: 0.5109 - val_loss: 0.6945 - val_accuracy: 0.4960\n", "Epoch 8/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6937 - accuracy: 0.5059 - val_loss: 0.6932 - val_accuracy: 0.5128\n", + "625/625 [==============================] - 225s 360ms/step - loss: 0.6936 - accuracy: 0.5110 - val_loss: 0.6963 - val_accuracy: 0.4968\n", "Epoch 9/10\n", - "625/625 [==============================] - 177s 283ms/step - loss: 0.6934 - accuracy: 0.5053 - val_loss: 0.6930 - val_accuracy: 0.5124\n", + "625/625 [==============================] - 226s 361ms/step - loss: 0.6934 - accuracy: 0.5101 - val_loss: 0.6948 - val_accuracy: 0.4960\n", "Epoch 10/10\n", - "625/625 [==============================] - 177s 
283ms/step - loss: 0.6931 - accuracy: 0.5088 - val_loss: 0.6930 - val_accuracy: 0.5130\n" + "625/625 [==============================] - 225s 361ms/step - loss: 0.6935 - accuracy: 0.5096 - val_loss: 0.6936 - val_accuracy: 0.4986\n" ] } ], @@ -1547,7 +1547,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1584,1488 +1584,183 @@ " }\"\"\",\n", " redistribution_info=,\n", ")\n", - "\n", - "\u001b[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /home/raschka/tensorflow_datasets/imdb_reviews/plain_text/0.1.0...\u001b[0m\n" + "\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "de79e74b2c204490bac94be76c59f0a5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "538b24048f2a4307b2570bbcd6b4ab00", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…" + "dict_keys(['test', 'train', 'unsupervised'])" ] }, + "execution_count": 40, "metadata": {}, - "output_type": "display_data" - }, + "output_type": "execute_result" + } + ], + "source": [ + "imdb_bldr = tfds.builder('imdb_reviews')\n", + "print(imdb_bldr.info)\n", + "\n", + "imdb_bldr.download_and_prepare()\n", + "\n", + "datasets = imdb_bldr.as_dataset(shuffle_files=False)\n", + "\n", + "datasets.keys()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "imdb_train = datasets['train']\n", + "imdb_test = datasets['test']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### B -- Tokenizer and Encoder\n", + "\n", + " * 
`tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n", + " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "\n", - "\n", - "\n" + "\n", + "[2, 1, 4, 3]\n", + "[2, 1, 4, 3, 5, 5, 5, 5, 5, 5]\n" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5be14932e00141e296b58813c2aed3e3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, + } + ], + "source": [ + "vocab_set = {'a', 'b', 'c', 'd'}\n", + "encoder = tfds.features.text.TokenTextEncoder(vocab_set)\n", + "print(encoder)\n", + "\n", + "print(encoder.encode(b'a b c d, , : .'))\n", + "\n", + "print(encoder.encode(b'a b c d e f g h i z'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### C -- Text Pre-processing with Keras " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\r" + "[[1, 2, 3, 4], [5, 6, 7, 8]]\n" ] }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "00f49d8f9d6148fea84e304e312c4a3f", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Shuffling...', max=10, style=ProgressStyle(description_width=…" + "array([[0, 0, 0, 0, 0, 0, 1, 2, 3, 4],\n", + " [0, 0, 0, 0, 0, 0, 5, 6, 7, 8]], dtype=int32)" ] }, + "execution_count": 43, "metadata": {}, - "output_type": "display_data" - }, + 
"output_type": "execute_result" + } + ], + "source": [ + "TOP_K = 200\n", + "MAX_LEN = 10\n", + "\n", + "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n", + "\n", + "tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])\n", + "sequences = tokenizer.texts_to_sequences(['this is an example', 'je suis en forme '])\n", + "print(sequences)\n", + "\n", + "tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From /home/raschka/miniconda3/lib/python3.7/site-packages/tensorflow_datasets/core/file_format_adapter.py:209: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use eager execution and: \n", - "`tf.data.TFRecordDataset(path)`\n" + "25000\n", + "(25000, 500)\n" ] - }, + } + ], + "source": [ + "TOP_K = 20000\n", + "MAX_LEN = 500\n", + "\n", + "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n", + "\n", + "tokenizer.fit_on_texts(\n", + " [example['text'].numpy().decode('utf-8') \n", + " for example in imdb_train])\n", + "\n", + "x_train = tokenizer.texts_to_sequences(\n", + " [example['text'].numpy().decode('utf-8')\n", + " for example in imdb_train])\n", + "\n", + "print(len(x_train))\n", + "\n", + "\n", + "x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(\n", + " x_train, maxlen=MAX_LEN)\n", + "\n", + "print(x_train_padded.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### D -- Embedding\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From 
/home/raschka/miniconda3/lib/python3.7/site-packages/tensorflow_datasets/core/file_format_adapter.py:209: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use eager execution and: \n", - "`tf.data.TFRecordDataset(path)`\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "38d1a86661954cae828ad256d40a92bc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ccc6d64908494ca69c31c17ed9d3fc8a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9cdb75bbe50c44d7b98ddee86990a056", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f629460148724702a0dc07f39a1036fb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5d2bc744109141e8af87a090b7f0bd79", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, 
bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f65100cfcff9421891c3aca5f92e6e35", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b0357aa53a274fc6aba4e9383937ce32", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c9b5a917709f4ca58804e20b6e57763e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aa9565276fec427083574782b7a978b7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7ca461ea11be4af184f432cbb612e5f2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "636e631bbd594aebb0c33111e0ee7d7d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0932df82a331465c8d82cc69dbb0521f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "56b715c451f24fa9bbfe8d7214e89d0e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d2312c56af214db181801c1a35437553", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3d058093762f40beb363ee7c3e282609", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3a9ecefeac964b07bf14b89c21575735", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e641fc32d8cd43a985f4aa8467c3f1c8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fdf16f70917a4b47b7840b4b6186df4d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c2cdeaf540764609a896ec59e38feaaa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b0ca0e9241b540139ce759425c71890d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "11bd659783ec45e98be7caba5c11ef18", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cbb812579b0e446dafefdf5eccdc8612", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Shuffling...', max=10, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5b93a9d9b9c549be95ea73d507693ba5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f32dae170cd449a69ec862d20641133a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fab06d6b0b3c469d8f4051bd96e9aa71", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "71d2f0d35e4e495e8eb6eaa45ec28c32", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"d03e6d401cfc4d09a784b3fedc9c4df9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9f158c0c614348029a1573a5ee738f49", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "77c83c93259f48cd96bdc3b9f999ac12", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8ec339e2dfa34ae5936d0f2967ce4ed1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7f21b47ffde14506bb446928510dad38", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "941941e809f7481db75aaac0db3611d1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, 
style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f9f58fc3f4344318b7bf62710b51d924", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c52c15d2303a4d6cb4ab4a64018d69c5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3a8d6944bff74d36bcd2e17639e7f64b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "47763715394b49618453989273c15a38", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c50a3c53780b4e4297a078ada405734a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": 
"b175e9e1fc074ccb9df408b1d8c7481a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "45291f47ad2a48f79c707485c2d8994e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a2ca9a9a1a044a30bc234e4b763640d3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a6323fb1eb2847e7b15778840a4df2b1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4402375d75d541ecb87ec1581d0087c0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "885b9473e3d544a3b6b97a2e6cd84210", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c332b6392ce14b63aedf87ab9ba548d9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Shuffling...', max=20, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9f134ac7d9794013acd4de7fd85df3c8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "703bd98b621c4db8b492b1009ba6b8f2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6795e1b6cc794f6f971b2436787a51f6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bb3d851c2a63462c8856c6c58aa1809e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ebe4b3cc57ce4f87a7095e271c481429", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6e9a9b4e0f01488eaf79dcb62906fd28", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2e89ff3e960f4fae9ada9990cbf53fc1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fe268d74efdb4cee9e6283285b390050", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5f10167829704d9aafb8b4908f8598fd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a030df39d3b847649964e0efda3914e1", - "version_major": 2, - "version_minor": 0 - }, - 
"text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0f5f4d2d423c46e4aa5267bc9394e9e2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e2ebfe1b9d2a407081a6159adb873613", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2b37cd9626124d669a42162466e49b9b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "22bf09a24a2d4972b82c2caf15bd15bc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "eeed7bd1d3074be6baaabc7bad706565", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { 
- "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fc32c7db4618406384f739c816bd9cfd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2810227adcf54e479c0eb3a3b8d13b4f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5252aa813ea04294a5fceec21aeed1ea", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "169e22f658ac41d4809753405ccb3df2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "861198bcff3842f1a35544fd982505de", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8fad9d16feb3446191b7a0d8430b5886", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "915c8888fa5b4a24ba3ec935f2924955", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0f378c95e1d448a3b0fab61759b91d69", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c68480a4635e4b7993649df9f50c173f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1d1cc390ed3d4dfd86de7743a39f1e59", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b19fe83393624220a6dace6c965e2830", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "c30986d29dc64281b62f35fce1ffdbfb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9916f0608c454c558fe349c8a308388b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0027ed54de6e4010be6d833d38303ebd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f395c711d04f46fc92a97153f2fd0688", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ea071488d6464996981af2af51c043d3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0dd6e0e5e15647d9acc9d8a7096e5ecb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0832cbbf73a94d5ab281b4cb26eba9d6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "592be5c9161d444d8b81b8be08a879e5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5537626415c94ceea6db81b5043cb861", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3fc229b5336f4b81943586079469dff9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e336e5505bdc48eb91ce493b942f098a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "4daf1ba7e9bf4cb4b21948197d8b3e7f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1d89a6af1797401ba8c9b64755bf5de6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', description='Reading...', max=1, style=ProgressStyle(des…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4a147ea88311404488d6601531ba5350", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Writing...', max=2500, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mDataset imdb_reviews downloaded and prepared to /home/raschka/tensorflow_datasets/imdb_reviews/plain_text/0.1.0. 
Subsequent calls will reuse this data.\u001b[0m\n" - ] - }, - { - "data": { - "text/plain": [ - "dict_keys(['test', 'train', 'unsupervised'])" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "imdb_bldr = tfds.builder('imdb_reviews')\n", - "print(imdb_bldr.info)\n", - "\n", - "imdb_bldr.download_and_prepare()\n", - "\n", - "datasets = imdb_bldr.as_dataset(shuffle_files=False)\n", - "\n", - "datasets.keys()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "imdb_train = datasets['train']\n", - "imdb_train = datasets['test']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### B -- Tokenizer and Encoder\n", - "\n", - " * `tfds.features.text.Tokenizer`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/Tokenizer\n", - " * `tfds.features.text.TokenTextEncoder`: https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TokenTextEncoder\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[4, 3, 2, 1]\n", - "[4, 3, 2, 1, 5, 5, 5, 5, 5, 5]\n" - ] - } - ], - "source": [ - "vocab_set = {'a', 'b', 'c', 'd'}\n", - "encoder = tfds.features.text.TokenTextEncoder(vocab_set)\n", - "print(encoder)\n", - "\n", - "print(encoder.encode(b'a b c d, , : .'))\n", - "\n", - "print(encoder.encode(b'a b c d e f g h i z'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### C -- Text Pre-processing with Keras " - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[1, 2, 3, 4], [5, 6, 7, 8]]\n" - ] - }, - { - "data": { - "text/plain": [ - 
"array([[0, 0, 0, 0, 0, 0, 1, 2, 3, 4],\n", - " [0, 0, 0, 0, 0, 0, 5, 6, 7, 8]], dtype=int32)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TOP_K = 200\n", - "MAX_LEN = 10\n", - "\n", - "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n", - "\n", - "tokenizer.fit_on_texts(['this is an example', 'je suis en forme '])\n", - "sequences = tokenizer.texts_to_sequences(['this is an example', 'je suis en forme '])\n", - "print(sequences)\n", - "\n", - "tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "25000\n", - "(25000, 500)\n" - ] - } - ], - "source": [ - "TOP_K = 20000\n", - "MAX_LEN = 500\n", - "\n", - "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOP_K)\n", - "\n", - "tokenizer.fit_on_texts(\n", - " [example['text'].numpy().decode('utf-8') \n", - " for example in imdb_train])\n", - "\n", - "x_train = tokenizer.texts_to_sequences(\n", - " [example['text'].numpy().decode('utf-8')\n", - " for example in imdb_train])\n", - "\n", - "print(len(x_train))\n", - "\n", - "\n", - "x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(\n", - " x_train, maxlen=MAX_LEN)\n", - "\n", - "print(x_train_padded.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### D -- Embedding\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]\n", - " [-0.00420691818 -0.0375086069 -0.00477621704 0.00311584398]\n", - " [0.028728161 -0.0440448038 -0.0428906195 -0.019158531]\n", - " [-0.0248817336 0.0408470519 -0.00285203382 -0.0257614851]\n", - " [0.0443614833 0.00331580639 0.043055404 -0.011118304]\n", - " 
[-0.0281324144 0.00720113516 0.0192188732 -0.0186921246]]\n", - "TensorShape([6, 4])\n", - "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]]\n" + "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]\n", + " [-0.00420691818 -0.0375086069 -0.00477621704 0.00311584398]\n", + " [0.028728161 -0.0440448038 -0.0428906195 -0.019158531]\n", + " [-0.0248817336 0.0408470519 -0.00285203382 -0.0257614851]\n", + " [0.0443614833 0.00331580639 0.043055404 -0.011118304]\n", + " [-0.0281324144 0.00720113516 0.0192188732 -0.0186921246]]\n", + "TensorShape([6, 4])\n", + "[[-0.0208060984 0.0142502077 0.0475785471 -0.00649005175]]\n" ] } ], @@ -3104,7 +1799,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -3112,7 +1807,7 @@ "output_type": "stream", "text": [ "[NbConvertApp] Converting notebook ch16_part1.ipynb to script\n", - "[NbConvertApp] Writing 16576 bytes to ch16_part1.py\n" + "[NbConvertApp] Writing 17277 bytes to ch16_part1.py\n" ] } ], diff --git a/ch16/ch16_part1.py b/ch16/ch16_part1.py index 0a4e6cf8..19fdf42a 100644 --- a/ch16/ch16_part1.py +++ b/ch16/ch16_part1.py @@ -31,22 +31,57 @@ + + + + # # Introducing sequential data # # ## Modeling sequential data⁠—order matters # # ## Representing sequences # +# + + + + + # ## The different categories of sequence modeling + + + + # # RNNs for modeling sequences # # ## Understanding the RNN looping mechanism # + + + + + + + + + # ## Computing activations in an RNN # + + + + + + + + + # ## Hidden-recurrence vs. 
output-recurrence -# + + + + @@ -98,9 +133,19 @@ # ## The challenges of learning long-range interactions +# + + + + + # # ## Long Short-Term Memory cells + + + + # # Implementing RNNs for sequence modeling in TensorFlow # # ## Project one: predicting the sentiment of IMDb movie reviews @@ -261,7 +306,7 @@ def encode_map_fn(text, label): 32, padded_shapes=([-1],[])) -# #### Embedding layers for sentence encoding +# ### Embedding layers for sentence encoding # # # * `input_dim`: number of words, i.e. maximum integer index + 1. @@ -281,6 +326,10 @@ def encode_map_fn(text, label): + + + + model = tf.keras.Sequential() model.add(Embedding(input_dim=100,