{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### Processing words as a sequence: The sequence model approach"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### A first practical example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Downloading the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n",
    "!tar -xf aclImdb_v1.tar.gz\n",
    "!rm -r aclImdb/train/unsup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Preparing the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import os, pathlib, shutil, random\n",
    "from tensorflow import keras\n",
    "batch_size = 32\n",
    "base_dir = pathlib.Path(\"aclImdb\")\n",
    "val_dir = base_dir / \"val\"\n",
    "train_dir = base_dir / \"train\"\n",
    "for category in (\"neg\", \"pos\"):\n",
    "    os.makedirs(val_dir / category)\n",
    "    files = os.listdir(train_dir / category)\n",
    "    random.Random(1337).shuffle(files)\n",
    "    num_val_samples = int(0.2 * len(files))\n",
    "    val_files = files[-num_val_samples:]\n",
    "    for fname in val_files:\n",
    "        shutil.move(train_dir / category / fname,\n",
    "                    val_dir / category / fname)\n",
    "\n",
    "train_ds = keras.utils.text_dataset_from_directory(\n",
    "    \"aclImdb/train\", batch_size=batch_size\n",
    ")\n",
    "val_ds = keras.utils.text_dataset_from_directory(\n",
    "    \"aclImdb/val\", batch_size=batch_size\n",
    ")\n",
    "test_ds = keras.utils.text_dataset_from_directory(\n",
    "    \"aclImdb/test\", batch_size=batch_size\n",
    ")\n",
    "text_only_train_ds = train_ds.map(lambda x, y: x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Preparing integer sequence datasets**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras import layers\n",
    "\n",
    "max_length = 600\n",
    "max_tokens = 20000\n",
    "text_vectorization = layers.TextVectorization(\n",
    "    max_tokens=max_tokens,\n",
    "    output_mode=\"int\",\n",
    "    output_sequence_length=max_length,\n",
    ")\n",
    "text_vectorization.adapt(text_only_train_ds)\n",
    "\n",
    "int_train_ds = train_ds.map(\n",
    "    lambda x, y: (text_vectorization(x), y),\n",
    "    num_parallel_calls=4)\n",
    "int_val_ds = val_ds.map(\n",
    "    lambda x, y: (text_vectorization(x), y),\n",
    "    num_parallel_calls=4)\n",
    "int_test_ds = test_ds.map(\n",
    "    lambda x, y: (text_vectorization(x), y),\n",
    "    num_parallel_calls=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**A sequence model built on one-hot encoded vector sequences**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "embedded = tf.one_hot(inputs, depth=max_tokens)\n",
    "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Training a first basic sequence model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"one_hot_bidir_lstm.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n",
    "model = keras.models.load_model(\"one_hot_bidir_lstm.keras\")\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Understanding word embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Learning word embeddings with the Embedding layer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Instantiating an `Embedding` layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Model that uses an `Embedding` layer trained from scratch**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)\n",
    "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()\n",
    "\n",
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"embeddings_bidir_gru.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n",
    "model = keras.models.load_model(\"embeddings_bidir_gru.keras\")\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Understanding padding and masking"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Using an `Embedding` layer with masking enabled**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "embedded = layers.Embedding(\n",
    "    input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)\n",
    "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()\n",
    "\n",
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"embeddings_bidir_gru_with_masking.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n",
    "model = keras.models.load_model(\"embeddings_bidir_gru_with_masking.keras\")\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Using pretrained word embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "!wget http://nlp.stanford.edu/data/glove.6B.zip\n",
    "!unzip -q glove.6B.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Parsing the GloVe word-embeddings file**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "path_to_glove_file = \"glove.6B.100d.txt\"\n",
    "\n",
    "embeddings_index = {}\n",
    "with open(path_to_glove_file) as f:\n",
    "    for line in f:\n",
    "        word, coefs = line.split(maxsplit=1)\n",
    "        coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
    "        embeddings_index[word] = coefs\n",
    "\n",
    "print(f\"Found {len(embeddings_index)} word vectors.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Preparing the GloVe word-embeddings matrix**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "embedding_dim = 100\n",
    "\n",
    "vocabulary = text_vectorization.get_vocabulary()\n",
    "word_index = dict(zip(vocabulary, range(len(vocabulary))))\n",
    "\n",
    "embedding_matrix = np.zeros((max_tokens, embedding_dim))\n",
    "for word, i in word_index.items():\n",
    "    if i < max_tokens:\n",
    "        embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None:\n",
    "        embedding_matrix[i] = embedding_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "embedding_layer = layers.Embedding(\n",
    "    max_tokens,\n",
    "    embedding_dim,\n",
    "    embeddings_initializer=keras.initializers.Constant(embedding_matrix),\n",
    "    trainable=False,\n",
    "    mask_zero=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Model that uses a pretrained Embedding layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "embedded = embedding_layer(inputs)\n",
    "x = layers.Bidirectional(layers.LSTM(32))(embedded)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()\n",
    "\n",
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"glove_embeddings_sequence_model.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)\n",
    "model = keras.models.load_model(\"glove_embeddings_sequence_model.keras\")\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "chapter11_part02_sequence-models.i",
   "private_outputs": false,
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}