{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "# Deep learning for text" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "## Natural-language processing: The bird's eye view" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "## Preparing text data" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Text standardization" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Text splitting (tokenization)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Vocabulary indexing" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Using the TextVectorization layer" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import string\n", "\n", "class Vectorizer:\n", " def standardize(self, text):\n", " text = text.lower()\n", " return \"\".join(char for char in text if char not in string.punctuation)\n", "\n", " def tokenize(self, text):\n", " text = self.standardize(text)\n", " return text.split()\n", "\n", " def make_vocabulary(self, dataset):\n", " self.vocabulary = {\"\": 0, \"[UNK]\": 1}\n", " for text in dataset:\n", " text = self.standardize(text)\n", " tokens = self.tokenize(text)\n", " for token in tokens:\n", " if token not in self.vocabulary:\n", " self.vocabulary[token] = len(self.vocabulary)\n", " self.inverse_vocabulary = dict(\n", " (v, k) for k, v in self.vocabulary.items())\n", "\n", " def encode(self, text):\n", " text = self.standardize(text)\n", " tokens = self.tokenize(text)\n", " return [self.vocabulary.get(token, 1) for token in tokens]\n", "\n", " def decode(self, int_sequence):\n", " return \" \".join(\n", " self.inverse_vocabulary.get(i, \"[UNK]\") for i in int_sequence)\n", "\n", "vectorizer = Vectorizer()\n", "dataset = [\n", " \"I write, erase, rewrite\",\n", " \"Erase again, and then\",\n", " \"A poppy blooms.\",\n", "]\n", "vectorizer.make_vocabulary(dataset)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "test_sentence = \"I write, rewrite, and still rewrite again\"\n", "encoded_sentence = vectorizer.encode(test_sentence)\n", "print(encoded_sentence)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "decoded_sentence = vectorizer.decode(encoded_sentence)\n", "print(decoded_sentence)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "from tensorflow.keras.layers import TextVectorization\n", "text_vectorization = TextVectorization(\n", " output_mode=\"int\",\n", ")" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], 
"source": [ "import re\n", "import string\n", "import tensorflow as tf\n", "\n", "def custom_standardization_fn(string_tensor):\n", " lowercase_string = tf.strings.lower(string_tensor)\n", " return tf.strings.regex_replace(\n", " lowercase_string, f\"[{re.escape(string.punctuation)}]\", \"\")\n", "\n", "def custom_split_fn(string_tensor):\n", " return tf.strings.split(string_tensor)\n", "\n", "text_vectorization = TextVectorization(\n", " output_mode=\"int\",\n", " standardize=custom_standardization_fn,\n", " split=custom_split_fn,\n", ")" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "dataset = [\n", " \"I write, erase, rewrite\",\n", " \"Erase again, and then\",\n", " \"A poppy blooms.\",\n", "]\n", "text_vectorization.adapt(dataset)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Displaying the vocabulary**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization.get_vocabulary()" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "vocabulary = text_vectorization.get_vocabulary()\n", "test_sentence = \"I write, rewrite, and still rewrite again\"\n", "encoded_sentence = text_vectorization(test_sentence)\n", "print(encoded_sentence)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "inverse_vocab = dict(enumerate(vocabulary))\n", "decoded_sentence = \" \".join(inverse_vocab[int(i)] for i in encoded_sentence)\n", "print(decoded_sentence)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "## Two approaches for representing groups of words: Sets and sequences" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Preparing the IMDB movie reviews data" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", "!tar -xf aclImdb_v1.tar.gz" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "!rm -r aclImdb/train/unsup" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "!cat aclImdb/train/pos/4077_10.txt" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import os, pathlib, shutil, random\n", "\n", "base_dir = pathlib.Path(\"aclImdb\")\n", "val_dir = base_dir / \"val\"\n", "train_dir = base_dir / \"train\"\n", "for category in (\"neg\", \"pos\"):\n", " os.makedirs(val_dir / category)\n", " files = os.listdir(train_dir / category)\n", " random.Random(1337).shuffle(files)\n", " num_val_samples = int(0.2 * len(files))\n", " val_files = files[-num_val_samples:]\n", " for fname in val_files:\n", " shutil.move(train_dir / category / fname,\n", " val_dir / category / fname)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "from tensorflow import keras\n", "batch_size = 32\n", "\n", "train_ds = keras.utils.text_dataset_from_directory(\n", " \"aclImdb/train\", batch_size=batch_size\n", ")\n", "val_ds = keras.utils.text_dataset_from_directory(\n", " \"aclImdb/val\", batch_size=batch_size\n", ")\n", "test_ds = 
keras.utils.text_dataset_from_directory(\n", " \"aclImdb/test\", batch_size=batch_size\n", ")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Displaying the shapes and dtypes of the first batch**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "for inputs, targets in train_ds:\n", " print(\"inputs.shape:\", inputs.shape)\n", " print(\"inputs.dtype:\", inputs.dtype)\n", " print(\"targets.shape:\", targets.shape)\n", " print(\"targets.dtype:\", targets.dtype)\n", " print(\"inputs[0]:\", inputs[0])\n", " print(\"targets[0]:\", targets[0])\n", " break" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "### Processing words as a set: The bag-of-words approach" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Single words (unigrams) with binary encoding" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Preprocessing our datasets with a `TextVectorization` layer**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization = TextVectorization(\n", " max_tokens=20000,\n", " output_mode=\"multi_hot\",\n", ")\n", "text_only_train_ds = train_ds.map(lambda x, y: x)\n", "text_vectorization.adapt(text_only_train_ds)\n", "\n", "binary_1gram_train_ds = train_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "binary_1gram_val_ds = val_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "binary_1gram_test_ds = test_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Inspecting the output of our binary unigram dataset**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "for inputs, targets in binary_1gram_train_ds:\n", " print(\"inputs.shape:\", inputs.shape)\n", " print(\"inputs.dtype:\", inputs.dtype)\n", " print(\"targets.shape:\", targets.shape)\n", " print(\"targets.dtype:\", targets.dtype)\n", " print(\"inputs[0]:\", inputs[0])\n", " print(\"targets[0]:\", targets[0])\n", " break" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Our model-building utility**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "\n", "def get_model(max_tokens=20000, hidden_dim=16):\n", " inputs = keras.Input(shape=(max_tokens,))\n", " x = layers.Dense(hidden_dim, activation=\"relu\")(inputs)\n", " x = layers.Dropout(0.5)(x)\n", " outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n", " model = keras.Model(inputs, outputs)\n", " model.compile(optimizer=\"rmsprop\",\n", " loss=\"binary_crossentropy\",\n", " metrics=[\"accuracy\"])\n", " return model" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Training and testing the binary unigram model**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "model = get_model()\n", "model.summary()\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"binary_1gram.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(binary_1gram_train_ds.cache(),\n", " 
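# (Added note, not in the book:) .cache() keeps the vectorized batches in\n", "          # memory after the first epoch, so later epochs skip the TextVectorization\n", "          # preprocessing entirely; only use it when the encoded data fits in RAM.\n", "          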
validation_data=binary_1gram_val_ds.cache(),\n", " epochs=10,\n", " callbacks=callbacks)\n", "model = keras.models.load_model(\"binary_1gram.keras\")\n", "print(f\"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Bigrams with binary encoding" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Configuring the `TextVectorization` layer to return bigrams**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization = TextVectorization(\n", " ngrams=2,\n", " max_tokens=20000,\n", " output_mode=\"multi_hot\",\n", ")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Training and testing the binary bigram model**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization.adapt(text_only_train_ds)\n", "binary_2gram_train_ds = train_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "binary_2gram_val_ds = val_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "binary_2gram_test_ds = test_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "\n", "model = get_model()\n", "model.summary()\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"binary_2gram.keras\",\n", " save_best_only=True)\n", "]\n", "model.fit(binary_2gram_train_ds.cache(),\n", " validation_data=binary_2gram_val_ds.cache(),\n", " epochs=10,\n", " callbacks=callbacks)\n", "model = keras.models.load_model(\"binary_2gram.keras\")\n", "print(f\"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "#### Bigrams with TF-IDF encoding" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Configuring the `TextVectorization` layer to return token counts**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization = TextVectorization(\n", " ngrams=2,\n", " max_tokens=20000,\n", " output_mode=\"count\"\n", ")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Configuring `TextVectorization` to return TF-IDF-weighted outputs**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization = TextVectorization(\n", " ngrams=2,\n", " max_tokens=20000,\n", " output_mode=\"tf_idf\",\n", ")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Training and testing the TF-IDF bigram model**" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "text_vectorization.adapt(text_only_train_ds)\n", "\n", "tfidf_2gram_train_ds = train_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "tfidf_2gram_val_ds = val_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "tfidf_2gram_test_ds = test_ds.map(\n", " lambda x, y: (text_vectorization(x), y),\n", " num_parallel_calls=4)\n", "\n", "model = get_model()\n", "model.summary()\n", "callbacks = [\n", " keras.callbacks.ModelCheckpoint(\"tfidf_2gram.keras\",\n", " save_best_only=True)\n", "]\n", 
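"" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [ "**Added aside (not in the book):** unlike the binary `multi_hot` vectors used above, `tf_idf` outputs are real-valued weights that grow with a term's frequency inside a review and shrink with its frequency across the corpus. The illustrative cell below peeks at one encoded review before training." ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "# Illustrative snippet: TF-IDF values are floats, not 0/1 flags.\n", "for raw_texts, _ in train_ds.take(1):\n", "    print(text_vectorization(raw_texts)[0][:10])" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [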
"model.fit(tfidf_2gram_train_ds.cache(),\n", " validation_data=tfidf_2gram_val_ds.cache(),\n", " epochs=10,\n", " callbacks=callbacks)\n", "model = keras.models.load_model(\"tfidf_2gram.keras\")\n", "print(f\"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}\")" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "inputs = keras.Input(shape=(1,), dtype=\"string\")\n", "processed_inputs = text_vectorization(inputs)\n", "outputs = model(processed_inputs)\n", "inference_model = keras.Model(inputs, outputs)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "import tensorflow as tf\n", "raw_text_data = tf.convert_to_tensor([\n", " [\"That was an excellent movie, I loved it.\"],\n", "])\n", "predictions = inference_model(raw_text_data)\n", "print(f\"{float(predictions[0] * 100):.2f} percent positive\")" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "chapter11_part01_introduction.i", "private_outputs": false, "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 0 }