{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Processing words as a sequence: The sequence model approach" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### A first practical example" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Downloading the data**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", "!tar -xf aclImdb_v1.tar.gz\n", "!rm -r aclImdb/train/unsup" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Preparing the data**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import os, pathlib, shutil, random\n", "from tensorflow import keras\n", "batch_size = 32\n", "base_dir = pathlib.Path(\"aclImdb\")\n", "val_dir = base_dir / \"val\"\n", "train_dir = base_dir / \"train\"\n", "for category in (\"neg\", \"pos\"):\n", " os.makedirs(val_dir / category)\n", " files = os.listdir(train_dir / category)\n", " random.Random(1337).shuffle(files)\n", " num_val_samples = int(0.2 * len(files))\n", " val_files = files[-num_val_samples:]\n", " for fname in val_files:\n", " shutil.move(train_dir / category / fname,\n", " val_dir / category / fname)\n", "\n", "train_ds = keras.utils.text_dataset_from_directory(\n", " \"aclImdb/train\", batch_size=batch_size\n", ")\n", "val_ds = keras.utils.text_dataset_from_directory(\n", " \"aclImdb/val\", batch_size=batch_size\n", ")\n", "test_ds = keras.utils.text_dataset_from_directory(\n", " \"aclImdb/test\", batch_size=batch_size\n", ")\n", "text_only_train_ds = train_ds.map(lambda x, y: x)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Preparing integer sequence datasets**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "from tensorflow.keras import layers\n", "\n", "max_length = 600\n", "max_tokens = 20000\n", "text_vectorization = layers.TextVectorization(\n", " max_tokens=max_tokens,\n", " output_mode=\"int\",\n", " output_sequence_length=max_length,\n", ")\n", "text_vectorization.adapt(text_only_train_ds)\n", "\n", "int_train_ds = train_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "int_val_ds = val_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "int_test_ds = test_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**A sequence model built on one-hot encoded vector sequences**" ] }, { 
"cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import tensorflow as tf\n", "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n", "embedded = tf.one_hot(inputs, depth=max_tokens)\n", "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n", "x = layers.Dropout(0.5)(x)\n", "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n", "model = keras.Model(inputs, outputs)\n", "model.compile(optimizer=\"rmsprop\",\n", " loss=\"binary_crossentropy\",\n", " metrics=[\"accuracy\"])\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Training a first basic sequence model**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"one_hot_bidir_lstm.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n", "model = keras.models.load_model(\"one_hot_bidir_lstm.keras\")\n", "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Understanding word embeddings" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Learning word embeddings with the Embedding layer" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Instantiating an `Embedding` layer**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Model that uses an `Embedding` layer trained from scratch**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n", "embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)\n", "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n", "x = layers.Dropout(0.5)(x)\n", "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n", "model = keras.Model(inputs, outputs)\n", "model.compile(optimizer=\"rmsprop\",\n", " loss=\"binary_crossentropy\",\n", " metrics=[\"accuracy\"])\n", "model.summary()\n", "\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"embeddings_bidir_gru.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n", "model = keras.models.load_model(\"embeddings_bidir_gru.keras\")\n", "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Understanding padding and masking" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Using an `Embedding` layer with masking enabled**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n", "embedded = layers.Embedding(\n", " input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)\n", "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n", "x = layers.Dropout(0.5)(x)\n", "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n", "model = keras.Model(inputs, outputs)\n", "model.compile(optimizer=\"rmsprop\",\n", " 
loss=\"binary_crossentropy\",\n", " metrics=[\"accuracy\"])\n", "model.summary()\n", "\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"embeddings_bidir_gru_with_masking.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n", "model = keras.models.load_model(\"embeddings_bidir_gru_with_masking.keras\")\n", "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Using pretrained word embeddings" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "!wget http://nlp.stanford.edu/data/glove.6B.zip\n", "!unzip -q glove.6B.zip" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Parsing the GloVe word-embeddings file**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import numpy as np\n", "path_to_glove_file = \"glove.6B.100d.txt\"\n", "\n", "embeddings_index = {}\n", "with open(path_to_glove_file) as f:\n", " for line in f:\n", " word, coefs = line.split(maxsplit=1)\n", " coefs = np.fromstring(coefs, \"f\", sep=\" \")\n", " embeddings_index[word] = coefs\n", "\n", "print(f\"Found {len(embeddings_index)} word vectors.\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Preparing the GloVe word-embeddings matrix**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "embedding_dim = 100\n", "\n", "vocabulary = text_vectorization.get_vocabulary()\n", "word_index = dict(zip(vocabulary, range(len(vocabulary))))\n", "\n", "embedding_matrix = np.zeros((max_tokens, embedding_dim))\n", "for word, i in word_index.items():\n", " if i < max_tokens:\n", " embedding_vector = embeddings_index.get(word)\n", " if embedding_vector is not None:\n", " embedding_matrix[i] = embedding_vector" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "embedding_layer = layers.Embedding(\n", " max_tokens,\n", " embedding_dim,\n", " embeddings_initializer=keras.initializers.Constant(embedding_matrix),\n", " trainable=False,\n", " mask_zero=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Model that uses a pretrained Embedding layer**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n", "embedded = embedding_layer(inputs)\n", "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n", "x = layers.Dropout(0.5)(x)\n", "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n", "model = keras.Model(inputs, outputs)\n", "model.compile(optimizer=\"rmsprop\",\n", " loss=\"binary_crossentropy\",\n", " metrics=[\"accuracy\"])\n", "model.summary()\n", "\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"glove_embeddings_sequence_model.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n", "model = keras.models.load_model(\"glove_embeddings_sequence_model.keras\")\n", "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "chapter11_part02_sequence-models.i", 
"private_outputs": false, "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 0 }