{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Chapter 13 – Loading and Preprocessing Data with TensorFlow**\n", "\n", "_This notebook contains all the sample code and solutions to the exercises in chapter 13._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", " \n", " \n", "
\n", " \"Open\n", " \n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Python ≥3.5 is required\n", "import sys\n", "assert sys.version_info >= (3, 5)\n", "\n", "# Is this notebook running on Colab or Kaggle?\n", "IS_COLAB = \"google.colab\" in sys.modules\n", "IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n", "\n", "if IS_COLAB or IS_KAGGLE:\n", " %pip install -q -U tfx\n", " print(\"You can safely ignore the package incompatibility errors.\")\n", "\n", "# Scikit-Learn ≥0.20 is required\n", "import sklearn\n", "assert sklearn.__version__ >= \"0.20\"\n", "\n", "# TensorFlow ≥2.0 is required\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "assert tf.__version__ >= \"2.0\"\n", "\n", "# Common imports\n", "import numpy as np\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "mpl.rc('axes', labelsize=14)\n", "mpl.rc('xtick', labelsize=12)\n", "mpl.rc('ytick', labelsize=12)\n", "\n", "# Where to save the figures\n", "PROJECT_ROOT_DIR = \".\"\n", "CHAPTER_ID = \"data\"\n", "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n", "os.makedirs(IMAGES_PATH, exist_ok=True)\n", "\n", "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n", " path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n", " print(\"Saving figure\", fig_id)\n", " if 
tight_layout:\n", " plt.tight_layout()\n", " plt.savefig(path, format=fig_extension, dpi=resolution)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = tf.range(10)\n", "dataset = tf.data.Dataset.from_tensor_slices(X)\n", "dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Equivalently:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "dataset = tf.data.Dataset.range(10)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(0, shape=(), dtype=int64)\n", "tf.Tensor(1, shape=(), dtype=int64)\n", "tf.Tensor(2, shape=(), dtype=int64)\n", "tf.Tensor(3, shape=(), dtype=int64)\n", "tf.Tensor(4, shape=(), dtype=int64)\n", "tf.Tensor(5, shape=(), dtype=int64)\n", "tf.Tensor(6, shape=(), dtype=int64)\n", "tf.Tensor(7, shape=(), dtype=int64)\n", "tf.Tensor(8, shape=(), dtype=int64)\n", "tf.Tensor(9, shape=(), dtype=int64)\n" ] } ], "source": [ "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [ "raises-exception" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)\n", "tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)\n", "tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)\n", "tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)\n", "tf.Tensor([8 9], shape=(2,), dtype=int64)\n" ] } ], "source": [ "dataset = dataset.repeat(3).batch(7)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.map(lambda x: x * 2)" ] }, { "cell_type": "code", "execution_count": 7, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor([ 0 2 4 6 8 10 12], shape=(7,), dtype=int64)\n", "tf.Tensor([14 16 18 0 2 4 6], shape=(7,), dtype=int64)\n", "tf.Tensor([ 8 10 12 14 16 18 0], shape=(7,), dtype=int64)\n", "tf.Tensor([ 2 4 6 8 10 12 14], shape=(7,), dtype=int64)\n", "tf.Tensor([16 18], shape=(2,), dtype=int64)\n" ] } ], "source": [ "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "#dataset = dataset.apply(tf.data.experimental.unbatch()) # Now deprecated\n", "dataset = dataset.unbatch()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.filter(lambda x: x < 10) # keep only items < 10" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(0, shape=(), dtype=int64)\n", "tf.Tensor(2, shape=(), dtype=int64)\n", "tf.Tensor(4, shape=(), dtype=int64)\n" ] } ], "source": [ "for item in dataset.take(3):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor([1 3 0 4 2 5 6], shape=(7,), dtype=int64)\n", "tf.Tensor([8 7 1 0 3 2 5], shape=(7,), dtype=int64)\n", "tf.Tensor([4 6 9 8 9 7 0], shape=(7,), dtype=int64)\n", "tf.Tensor([3 1 4 5 2 8 7], shape=(7,), dtype=int64)\n", "tf.Tensor([6 9], shape=(2,), dtype=int64)\n" ] } ], "source": [ "tf.random.set_seed(42)\n", "\n", "dataset = tf.data.Dataset.range(10).repeat(3)\n", "dataset = dataset.shuffle(buffer_size=3, seed=42).batch(7)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split the California dataset to multiple CSV files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start by loading and preparing the California housing dataset. 
We first load it, then split it into a training set, a validation set and a test set, and finally we compute the mean and standard deviation of each input feature on the training set (we will use them later to scale the inputs):" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_california_housing\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "housing = fetch_california_housing()\n", "X_train_full, X_test, y_train_full, y_test = train_test_split(\n", " housing.data, housing.target.reshape(-1, 1), random_state=42)\n", "X_train, X_valid, y_train, y_valid = train_test_split(\n", " X_train_full, y_train_full, random_state=42)\n", "\n", "scaler = StandardScaler()\n", "scaler.fit(X_train)\n", "X_mean = scaler.mean_\n", "X_std = scaler.scale_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For a very large dataset that does not fit in memory, you will typically want to split it into many files first, then have TensorFlow read these files in parallel. 
To demonstrate this, let's start by splitting the housing dataset and save it to 20 CSV files:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
 "def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
 "    \"\"\"Split `data` row-wise into `n_parts` CSV files and return their paths.\n",
 "\n",
 "    The files are written to datasets/housing/my_{name_prefix}_{NN}.csv. When\n",
 "    `header` is not None, it is written as the first line of every file.\n",
 "    \"\"\"\n",
 "    housing_dir = os.path.join(\"datasets\", \"housing\")\n",
 "    os.makedirs(housing_dir, exist_ok=True)\n",
 "    path_format = os.path.join(housing_dir, \"my_{}_{:02d}.csv\")\n",
 "\n",
 "    def format_value(value):\n",
 "        # repr() of a NumPy scalar became e.g. \"np.float64(1.5)\" in NumPy 2.0,\n",
 "        # which would corrupt the CSV; str() still yields the bare number.\n",
 "        return str(value) if isinstance(value, np.number) else repr(value)\n",
 "\n",
 "    filepaths = []\n",
 "    m = len(data)\n",
 "    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):\n",
 "        part_csv = path_format.format(name_prefix, file_idx)\n",
 "        filepaths.append(part_csv)\n",
 "        with open(part_csv, \"wt\", encoding=\"utf-8\") as f:\n",
 "            if header is not None:\n",
 "                f.write(header)\n",
 "                f.write(\"\\n\")\n",
 "            for row_idx in row_indices:\n",
 "                f.write(\",\".join([format_value(col) for col in data[row_idx]]))\n",
 "                f.write(\"\\n\")\n",
 "    return filepaths"
 ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "train_data = np.c_[X_train, y_train]\n", "valid_data = np.c_[X_valid, y_valid]\n", "test_data = np.c_[X_test, y_test]\n", "header_cols = housing.feature_names + [\"MedianHouseValue\"]\n", "header = \",\".join(header_cols)\n", "\n", "train_filepaths = save_to_multiple_csv_files(train_data, \"train\", header, n_parts=20)\n", "valid_filepaths = save_to_multiple_csv_files(valid_data, \"valid\", header, n_parts=10)\n", "test_filepaths = save_to_multiple_csv_files(test_data, \"test\", header, n_parts=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Okay, now let's take a peek at the first few lines of one of these CSV files:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeMedianHouseValue
03.521415.03.0499451.1065481447.01.60599337.63-122.431.442
15.32755.06.4900600.9910543464.03.44334033.69-117.391.687
23.100029.07.5423731.5915251328.02.25084738.44-122.981.621
37.173612.06.2890030.9974421054.02.69565233.55-117.702.621
42.054913.05.3124571.0850923297.02.24438433.93-116.930.956
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 3.5214 15.0 3.049945 1.106548 1447.0 1.605993 37.63 \n", "1 5.3275 5.0 6.490060 0.991054 3464.0 3.443340 33.69 \n", "2 3.1000 29.0 7.542373 1.591525 1328.0 2.250847 38.44 \n", "3 7.1736 12.0 6.289003 0.997442 1054.0 2.695652 33.55 \n", "4 2.0549 13.0 5.312457 1.085092 3297.0 2.244384 33.93 \n", "\n", " Longitude MedianHouseValue \n", "0 -122.43 1.442 \n", "1 -117.39 1.687 \n", "2 -122.98 1.621 \n", "3 -117.70 2.621 \n", "4 -116.93 0.956 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "pd.read_csv(train_filepaths[0]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or in text mode:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue\n", "3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442\n", "5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687\n", "3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621\n", "7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621\n" ] } ], "source": [ "with open(train_filepaths[0]) as f:\n", " for i in range(5):\n", " print(f.readline(), end=\"\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['datasets/housing/my_train_00.csv',\n", " 'datasets/housing/my_train_01.csv',\n", " 'datasets/housing/my_train_02.csv',\n", " 'datasets/housing/my_train_03.csv',\n", " 'datasets/housing/my_train_04.csv',\n", " 'datasets/housing/my_train_05.csv',\n", " 'datasets/housing/my_train_06.csv',\n", " 'datasets/housing/my_train_07.csv',\n", " 
'datasets/housing/my_train_08.csv',\n", " 'datasets/housing/my_train_09.csv',\n", " 'datasets/housing/my_train_10.csv',\n", " 'datasets/housing/my_train_11.csv',\n", " 'datasets/housing/my_train_12.csv',\n", " 'datasets/housing/my_train_13.csv',\n", " 'datasets/housing/my_train_14.csv',\n", " 'datasets/housing/my_train_15.csv',\n", " 'datasets/housing/my_train_16.csv',\n", " 'datasets/housing/my_train_17.csv',\n", " 'datasets/housing/my_train_18.csv',\n", " 'datasets/housing/my_train_19.csv']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_filepaths" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building an Input Pipeline" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_08.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_03.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)\n", 
"tf.Tensor(b'datasets/housing/my_train_04.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_11.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_18.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_06.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)\n" ] } ], "source": [ "for filepath in filepath_dataset:\n", " print(filepath)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "n_readers = 5\n", "dataset = filepath_dataset.interleave(\n", " lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n", " cycle_length=n_readers)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'4.6477,38.0,5.03728813559322,0.911864406779661,745.0,2.5254237288135593,32.64,-117.07,1.504'\n", "b'8.72,44.0,6.163179916317992,1.0460251046025104,668.0,2.794979079497908,34.2,-118.18,4.159'\n", "b'3.8456,35.0,5.461346633416459,0.9576059850374065,1154.0,2.8778054862842892,37.96,-122.05,1.598'\n", "b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526'\n", "b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625'\n" ] } ], "source": [ "for line in dataset.take(5):\n", " print(line.numpy())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that field 4 is interpreted as a string." 
] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), \"Hello\", tf.constant([])]\n", "parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)\n", "parsed_fields" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that all missing fields are replaced with their default value, when provided:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)\n", "parsed_fields" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The 5th field is compulsory (since we provided `tf.constant([])` as the \"default value\"), so we get an exception if we do not provide it:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Field 4 is required but missing in record 0! 
[Op:DecodeCSV]\n" ] } ], "source": [ "try:\n", " parsed_fields = tf.io.decode_csv(',,,,', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", " print(ex)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The number of fields should match exactly the number of fields in the `record_defaults`:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]\n" ] } ], "source": [ "try:\n", " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", " print(ex)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "n_inputs = 8 # X_train.shape[-1]\n", "\n", "@tf.function\n", "def preprocess(line):\n", " defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]\n", " fields = tf.io.decode_csv(line, record_defaults=defs)\n", " x = tf.stack(fields[:-1])\n", " y = tf.stack(fields[-1:])\n", " return (x - X_mean) / X_std, y" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(,\n", " )" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def csv_reader_dataset(filepaths, repeat=1, n_readers=5,\n", " n_read_threads=None, shuffle_buffer_size=10000,\n", " n_parse_threads=5, batch_size=32):\n", " dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)\n", " dataset = dataset.interleave(\n", " lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n", " cycle_length=n_readers, num_parallel_calls=n_read_threads)\n", " dataset = dataset.shuffle(shuffle_buffer_size)\n", " dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)\n", " 
dataset = dataset.batch(batch_size)\n", " return dataset.prefetch(1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X = tf.Tensor(\n", "[[ 0.5804519 -0.20762321 0.05616303 -0.15191229 0.01343246 0.00604472\n", " 1.2525111 -1.3671792 ]\n", " [ 5.818099 1.8491895 1.1784915 0.28173092 -1.2496178 -0.3571987\n", " 0.7231292 -1.0023477 ]\n", " [-0.9253566 0.5834586 -0.7807257 -0.28213993 -0.36530012 0.27389365\n", " -0.76194876 0.72684526]], shape=(3, 8), dtype=float32)\n", "y = tf.Tensor(\n", "[[1.752]\n", " [1.313]\n", " [1.535]], shape=(3, 1), dtype=float32)\n", "\n", "X = tf.Tensor(\n", "[[-0.8324941 0.6625668 -0.20741376 -0.18699841 -0.14536144 0.09635526\n", " 0.9807942 -0.67250353]\n", " [-0.62183803 0.5834586 -0.19862501 -0.3500319 -1.1437552 -0.3363751\n", " 1.107282 -0.8674123 ]\n", " [ 0.8683102 0.02970133 0.3427381 -0.29872298 0.7124906 0.28026953\n", " -0.72915536 0.86178064]], shape=(3, 8), dtype=float32)\n", "y = tf.Tensor(\n", "[[0.919]\n", " [1.028]\n", " [2.182]], shape=(3, 1), dtype=float32)\n", "\n" ] } ], "source": [ "tf.random.set_seed(42)\n", "\n", "train_set = csv_reader_dataset(train_filepaths, batch_size=3)\n", "for X_batch, y_batch in train_set.take(2):\n", " print(\"X =\", X_batch)\n", " print(\"y =\", y_batch)\n", " print()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "train_set = csv_reader_dataset(train_filepaths, repeat=None)\n", "valid_set = csv_reader_dataset(valid_filepaths)\n", "test_set = csv_reader_dataset(test_filepaths)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "keras.backend.clear_session()\n", "np.random.seed(42)\n", "tf.random.set_seed(42)\n", "\n", "model = keras.models.Sequential([\n", " keras.layers.Dense(30, activation=\"relu\", input_shape=X_train.shape[1:]),\n", " keras.layers.Dense(1),\n", "])" ] }, { "cell_type": "code", 
"execution_count": 32, "metadata": {}, "outputs": [], "source": [ "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(learning_rate=1e-3))" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "362/362 [==============================] - 1s 3ms/step - loss: 2.0914 - val_loss: 21.5124\n", "Epoch 2/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.8428 - val_loss: 0.6648\n", "Epoch 3/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.6329 - val_loss: 0.6196\n", "Epoch 4/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.5922 - val_loss: 0.5669\n", "Epoch 5/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.5622 - val_loss: 0.5402\n", "Epoch 6/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.5698 - val_loss: 0.5209\n", "Epoch 7/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.5195 - val_loss: 0.6130\n", "Epoch 8/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.5155 - val_loss: 0.4818\n", "Epoch 9/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.4965 - val_loss: 0.4904\n", "Epoch 10/10\n", "362/362 [==============================] - 0s 1ms/step - loss: 0.4925 - val_loss: 0.4585\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_size = 32\n", "model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,\n", " validation_data=valid_set)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "161/161 [==============================] - 0s 589us/step - loss: 0.4788\n" ] }, { "data": { "text/plain": [ "0.4787752032279968" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "model.evaluate(test_set, steps=len(X_test) // batch_size)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[2.3576407],\n", " [2.255291 ],\n", " [1.4437605],\n", " ...,\n", " [0.5654393],\n", " [3.9442453],\n", " [1.0232248]], dtype=float32)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_set = test_set.map(lambda X, y: X) # we could instead just pass test_set, Keras would ignore the labels\n", "X_new = X_test\n", "model.predict(new_set, steps=len(X_new) // batch_size)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Global step 1810/1810" ] } ], "source": [ "optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n", "loss_fn = keras.losses.mean_squared_error\n", "\n", "n_epochs = 5\n", "batch_size = 32\n", "n_steps_per_epoch = len(X_train) // batch_size\n", "total_steps = n_epochs * n_steps_per_epoch\n", "global_step = 0\n", "for X_batch, y_batch in train_set.take(total_steps):\n", " global_step += 1\n", " print(\"\\rGlobal step {}/{}\".format(global_step, total_steps), end=\"\")\n", " with tf.GradientTape() as tape:\n", " y_pred = model(X_batch)\n", " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n", " loss = tf.add_n([main_loss] + model.losses)\n", " gradients = tape.gradient(loss, model.trainable_variables)\n", " optimizer.apply_gradients(zip(gradients, model.trainable_variables))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "keras.backend.clear_session()\n", "np.random.seed(42)\n", "tf.random.set_seed(42)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n", "loss_fn = keras.losses.mean_squared_error\n", "\n", "@tf.function\n", "def train(model, n_epochs, 
batch_size=32,\n", " n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n", " train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n", " n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n", " n_parse_threads=n_parse_threads, batch_size=batch_size)\n", " for X_batch, y_batch in train_set:\n", " with tf.GradientTape() as tape:\n", " y_pred = model(X_batch)\n", " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n", " loss = tf.add_n([main_loss] + model.losses)\n", " gradients = tape.gradient(loss, model.trainable_variables)\n", " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n", "\n", "train(model, 5)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "keras.backend.clear_session()\n", "np.random.seed(42)\n", "tf.random.set_seed(42)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Global step 100 / 1810\n", "Global step 200 / 1810\n", "Global step 300 / 1810\n", "Global step 400 / 1810\n", "Global step 500 / 1810\n", "Global step 600 / 1810\n", "Global step 700 / 1810\n", "Global step 800 / 1810\n", "Global step 900 / 1810\n", "Global step 1000 / 1810\n", "Global step 1100 / 1810\n", "Global step 1200 / 1810\n", "Global step 1300 / 1810\n", "Global step 1400 / 1810\n", "Global step 1500 / 1810\n", "Global step 1600 / 1810\n", "Global step 1700 / 1810\n", "Global step 1800 / 1810\n" ] } ], "source": [ "optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n", "loss_fn = keras.losses.mean_squared_error\n", "\n", "@tf.function\n", "def train(model, n_epochs, batch_size=32,\n", " n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n", " train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n", " n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n", " 
n_parse_threads=n_parse_threads, batch_size=batch_size)\n", " n_steps_per_epoch = len(X_train) // batch_size\n", " total_steps = n_epochs * n_steps_per_epoch\n", " global_step = 0\n", " for X_batch, y_batch in train_set.take(total_steps):\n", " global_step += 1\n", " if tf.equal(global_step % 100, 0):\n", " tf.print(\"\\rGlobal step\", global_step, \"/\", total_steps)\n", " with tf.GradientTape() as tape:\n", " y_pred = model(X_batch)\n", " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n", " loss = tf.add_n([main_loss] + model.losses)\n", " gradients = tape.gradient(loss, model.trainable_variables)\n", " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n", "\n", "train(model, 5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a short description of each method in the `Dataset` class:" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "● apply() Applies a transformation function to this dataset.\n", "● as_numpy_iterator() Returns an iterator which converts all elements of the dataset to numpy.\n", "● batch() Combines consecutive elements of this dataset into batches.\n", "● cache() Caches the elements in this dataset.\n", "● cardinality() Returns the cardinality of the dataset, if known.\n", "● concatenate() Creates a `Dataset` by concatenating the given dataset with this dataset.\n", "● element_spec() The type specification of an element of this dataset.\n", "● enumerate() Enumerates the elements of this dataset.\n", "● filter() Filters this dataset according to `predicate`.\n", "● flat_map() Maps `map_func` across this dataset and flattens the result.\n", "● from_generator() Creates a `Dataset` whose elements are generated by `generator`. 
(deprecated arguments)\n", "● from_tensor_slices() Creates a `Dataset` whose elements are slices of the given tensors.\n", "● from_tensors() Creates a `Dataset` with a single element, comprising the given tensors.\n", "● interleave() Maps `map_func` across this dataset, and interleaves the results.\n", "● list_files() A dataset of all files matching one or more glob patterns.\n", "● map() Maps `map_func` across the elements of this dataset.\n", "● options() Returns the options for this dataset and its inputs.\n", "● padded_batch() Combines consecutive elements of this dataset into padded batches.\n", "● prefetch() Creates a `Dataset` that prefetches elements from this dataset.\n", "● range() Creates a `Dataset` of a step-separated range of values.\n", "● reduce() Reduces the input dataset to a single element.\n", "● repeat() Repeats this dataset so each original value is seen `count` times.\n", "● shard() Creates a `Dataset` that includes only 1/`num_shards` of this dataset.\n", "● shuffle() Randomly shuffles the elements of this dataset.\n", "● skip() Creates a `Dataset` that skips `count` elements from this dataset.\n", "● take() Creates a `Dataset` with at most `count` elements from this dataset.\n", "● unbatch() Splits elements of a dataset into multiple elements.\n", "● window() Combines (nests of) input elements into a dataset of (nests of) windows.\n", "● with_options() Returns a new `tf.data.Dataset` with the given options set.\n", "● zip() Creates a `Dataset` by zipping together the given datasets.\n" ] } ], "source": [
 "for m in dir(tf.data.Dataset):\n",
 "    if not (m.startswith(\"_\") or m.endswith(\"_\")):\n",
 "        func = getattr(tf.data.Dataset, m)\n",
 "        # hasattr(func, \"__doc__\") is true even when the docstring is None,\n",
 "        # so fetch the docstring and test it directly before splitting it.\n",
 "        doc = getattr(func, \"__doc__\", None)\n",
 "        if doc is not None:\n",
 "            print(\"● {:21s}{}\".format(m + \"()\", doc.split(\"\\n\")[0]))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `TFRecord` binary format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A TFRecord file is just a list of binary 
records. You can create one using a `tf.io.TFRecordWriter`:" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "with tf.io.TFRecordWriter(\"my_data.tfrecord\") as f:\n", " f.write(b\"This is the first record\")\n", " f.write(b\"And this is the second record\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And you can read it using a `tf.data.TFRecordDataset`:" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'This is the first record', shape=(), dtype=string)\n", "tf.Tensor(b'And this is the second record', shape=(), dtype=string)\n" ] } ], "source": [ "filepaths = [\"my_data.tfrecord\"]\n", "dataset = tf.data.TFRecordDataset(filepaths)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can read multiple TFRecord files with just one `TFRecordDataset`. By default it will read them one at a time, but if you set `num_parallel_reads=3`, it will read 3 at a time in parallel and interleave their records:" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'File 0 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 0 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 0 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 
1', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 2', shape=(), dtype=string)\n" ] } ], "source": [ "filepaths = [\"my_test_{}.tfrecord\".format(i) for i in range(5)]\n", "for i, filepath in enumerate(filepaths):\n", " with tf.io.TFRecordWriter(filepath) as f:\n", " for j in range(3):\n", " f.write(\"File {} record {}\".format(i, j).encode(\"utf-8\"))\n", "\n", "dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "options = tf.io.TFRecordOptions(compression_type=\"GZIP\")\n", "with tf.io.TFRecordWriter(\"my_compressed.tfrecord\", options) as f:\n", " f.write(b\"This is the first record\")\n", " f.write(b\"And this is the second record\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'This is the first record', shape=(), dtype=string)\n", "tf.Tensor(b'And this is the second record', shape=(), dtype=string)\n" ] } ], "source": [ "dataset = tf.data.TFRecordDataset([\"my_compressed.tfrecord\"],\n", " compression_type=\"GZIP\")\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A Brief Intro to Protocol Buffers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For this section you need to [install protobuf](https://developers.google.com/protocol-buffers/docs/downloads). In general you will not have to do so when using TensorFlow, as it comes with functions to create and parse protocol buffers of type `tf.train.Example`, which are generally sufficient. 
However, in this section we will learn about protocol buffers by creating our own simple protobuf definition, so we need the protobuf compiler (`protoc`): we will use it to compile the protobuf definition to a Python module that we can then use in our code." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First let's write a simple protobuf definition:" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting person.proto\n" ] } ], "source": [ "%%writefile person.proto\n", "syntax = \"proto3\";\n", "message Person {\n", " string name = 1;\n", " int32 id = 2;\n", " repeated string email = 3;\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And let's compile it (the `--descriptor_set_out` and `--include_imports` options are only required for the `tf.io.decode_proto()` example below):" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "!protoc person.proto --python_out=. 
--descriptor_set_out=person.desc --include_imports" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "person.desc person.proto person_pb2.py\n" ] } ], "source": [ "!ls person*" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: \"Al\"\n", "id: 123\n", "email: \"a@b.com\"\n", "\n" ] } ], "source": [ "from person_pb2 import Person\n", "\n", "person = Person(name=\"Al\", id=123, email=[\"a@b.com\"]) # create a Person\n", "print(person) # display the Person" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Al'" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person.name # read a field" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "person.name = \"Alice\" # modify a field" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a@b.com'" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person.email[0] # repeated fields can be accessed like arrays" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "person.email.append(\"c@d.com\") # add an email address" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'\\n\\x05Alice\\x10{\\x1a\\x07a@b.com\\x1a\\x07c@d.com'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = person.SerializeToString() # serialize to a byte string\n", "s" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "27" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person2 = Person() # 
create a new Person\n", "person2.ParseFromString(s) # parse the byte string (27 bytes)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person == person2 # now they are equal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Custom protobuf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In rare cases, you may want to parse a custom protobuf (like the one we just created) in TensorFlow. For this you can use the `tf.io.decode_proto()` function:" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person_tf = tf.io.decode_proto(\n", " bytes=s,\n", " message_type=\"Person\",\n", " field_names=[\"name\", \"id\", \"email\"],\n", " output_types=[tf.string, tf.int32, tf.string],\n", " descriptor_source=\"person.desc\")\n", "\n", "person_tf.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more details, see the [`tf.io.decode_proto()`](https://www.tensorflow.org/api_docs/python/tf/io/decode_proto) documentation." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TensorFlow Protobufs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is the definition of the `tf.train.Example` protobuf:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```proto\n", "syntax = \"proto3\";\n", "\n", "message BytesList { repeated bytes value = 1; }\n", "message FloatList { repeated float value = 1 [packed = true]; }\n", "message Int64List { repeated int64 value = 1 [packed = true]; }\n", "message Feature {\n", "    oneof kind {\n", "        BytesList bytes_list = 1;\n", "        FloatList float_list = 2;\n", "        Int64List int64_list = 3;\n", "    }\n", "};\n", "message Features { map<string, Feature> feature = 1; };\n", "message Example { Features features = 1; };\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning**: in TensorFlow 2.0 and 2.1, there was a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details." 
] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "#from tensorflow.train import BytesList, FloatList, Int64List\n", "#from tensorflow.train import Feature, Features, Example\n", "BytesList = tf.train.BytesList\n", "FloatList = tf.train.FloatList\n", "Int64List = tf.train.Int64List\n", "Feature = tf.train.Feature\n", "Features = tf.train.Features\n", "Example = tf.train.Example\n", "\n", "person_example = Example(\n", " features=Features(\n", " feature={\n", " \"name\": Feature(bytes_list=BytesList(value=[b\"Alice\"])),\n", " \"id\": Feature(int64_list=Int64List(value=[123])),\n", " \"emails\": Feature(bytes_list=BytesList(value=[b\"a@b.com\", b\"c@d.com\"]))\n", " }))\n", "\n", "with tf.io.TFRecordWriter(\"my_contacts.tfrecord\") as f:\n", " f.write(person_example.SerializeToString())" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "feature_description = {\n", " \"name\": tf.io.FixedLenFeature([], tf.string, default_value=\"\"),\n", " \"id\": tf.io.FixedLenFeature([], tf.int64, default_value=0),\n", " \"emails\": tf.io.VarLenFeature(tf.string),\n", "}\n", "for serialized_example in tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]):\n", " parsed_example = tf.io.parse_single_example(serialized_example,\n", " feature_description)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'emails': ,\n", " 'id': ,\n", " 'name': }" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'emails': ,\n", " 'id': ,\n", " 'name': }" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] 
}, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example[\"emails\"].values[0]" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.sparse.to_dense(parsed_example[\"emails\"], default_value=b\"\")" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example[\"emails\"].values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting Images in TFRecords" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.datasets import load_sample_images\n", "\n", "img = load_sample_images()[\"images\"][0]\n", "plt.imshow(img)\n", "plt.axis(\"off\")\n", "plt.title(\"Original Image\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "data = tf.io.encode_jpeg(img)\n", "example_with_image = Example(features=Features(feature={\n", " \"image\": Feature(bytes_list=BytesList(value=[data.numpy()]))}))\n", "serialized_example = example_with_image.SerializeToString()\n", "# then save to TFRecord" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "feature_description = { \"image\": tf.io.VarLenFeature(tf.string) }\n", "example_with_image = tf.io.parse_single_example(serialized_example, feature_description)\n", "decoded_img = tf.io.decode_jpeg(example_with_image[\"image\"].values[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or use `decode_image()` which supports BMP, GIF, JPEG and PNG formats:" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "decoded_img = tf.io.decode_image(example_with_image[\"image\"].values[0])" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.imshow(decoded_img)\n", "plt.title(\"Decoded Image\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting Tensors and Sparse Tensors in TFRecords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tensors can be serialized and parsed easily using `tf.io.serialize_tensor()` and `tf.io.parse_tensor()`:" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = tf.constant([[0., 1.], [2., 3.], [4., 5.]])\n", "s = tf.io.serialize_tensor(t)\n", "s" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.io.parse_tensor(s, out_type=tf.float32)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "serialized_sparse = tf.io.serialize_sparse(parsed_example[\"emails\"])\n", "serialized_sparse" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "value: \"\\010\\t\\022\\010\\022\\002\\010\\002\\022\\002\\010\\001\\\"\\020\\000\\000\\000\\000\\000\\000\\000\\000\\001\\000\\000\\000\\000\\000\\000\\000\"\n", "value: \"\\010\\007\\022\\004\\022\\002\\010\\002\\\"\\020\\007\\007a@b.comc@d.com\"\n", "value: \"\\010\\t\\022\\004\\022\\002\\010\\001\\\"\\010\\002\\000\\000\\000\\000\\000\\000\\000\"" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BytesList(value=serialized_sparse.numpy())" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, 
"outputs": [], "source": [ "dataset = tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]).batch(10)\n", "for serialized_examples in dataset:\n", " parsed_examples = tf.io.parse_example(serialized_examples,\n", " feature_description)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'image': }" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Handling Sequential Data Using `SequenceExample`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```proto\n", "syntax = \"proto3\";\n", "\n", "message FeatureList { repeated Feature feature = 1; };\n", "message FeatureLists { map feature_list = 1; };\n", "message SequenceExample {\n", " Features context = 1;\n", " FeatureLists feature_lists = 2;\n", "};\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning**: in TensorFlow 2.0 and 2.1, there was a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details." 
] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "#from tensorflow.train import FeatureList, FeatureLists, SequenceExample\n", "FeatureList = tf.train.FeatureList\n", "FeatureLists = tf.train.FeatureLists\n", "SequenceExample = tf.train.SequenceExample\n", "\n", "context = Features(feature={\n", " \"author_id\": Feature(int64_list=Int64List(value=[123])),\n", " \"title\": Feature(bytes_list=BytesList(value=[b\"A\", b\"desert\", b\"place\", b\".\"])),\n", " \"pub_date\": Feature(int64_list=Int64List(value=[1623, 12, 25]))\n", "})\n", "\n", "content = [[\"When\", \"shall\", \"we\", \"three\", \"meet\", \"again\", \"?\"],\n", " [\"In\", \"thunder\", \",\", \"lightning\", \",\", \"or\", \"in\", \"rain\", \"?\"]]\n", "comments = [[\"When\", \"the\", \"hurlyburly\", \"'s\", \"done\", \".\"],\n", " [\"When\", \"the\", \"battle\", \"'s\", \"lost\", \"and\", \"won\", \".\"]]\n", "\n", "def words_to_feature(words):\n", " return Feature(bytes_list=BytesList(value=[word.encode(\"utf-8\")\n", " for word in words]))\n", "\n", "content_features = [words_to_feature(sentence) for sentence in content]\n", "comments_features = [words_to_feature(comment) for comment in comments]\n", " \n", "sequence_example = SequenceExample(\n", " context=context,\n", " feature_lists=FeatureLists(feature_list={\n", " \"content\": FeatureList(feature=content_features),\n", " \"comments\": FeatureList(feature=comments_features)\n", " }))" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "context {\n", " feature {\n", " key: \"author_id\"\n", " value {\n", " int64_list {\n", " value: 123\n", " }\n", " }\n", " }\n", " feature {\n", " key: \"pub_date\"\n", " value {\n", " int64_list {\n", " value: 1623\n", " value: 12\n", " value: 25\n", " }\n", " }\n", " }\n", " feature {\n", " key: \"title\"\n", " value {\n", " bytes_list {\n", " value: \"A\"\n", " value: \"desert\"\n", " value: \"place\"\n", " value: 
\".\"\n", " }\n", " }\n", " }\n", "}\n", "feature_lists {\n", " feature_list {\n", " key: \"comments\"\n", " value {\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"the\"\n", " value: \"hurlyburly\"\n", " value: \"\\'s\"\n", " value: \"done\"\n", " value: \".\"\n", " }\n", " }\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"the\"\n", " value: \"battle\"\n", " value: \"\\'s\"\n", " value: \"lost\"\n", " value: \"and\"\n", " value: \"won\"\n", " value: \".\"\n", " }\n", " }\n", " }\n", " }\n", " feature_list {\n", " key: \"content\"\n", " value {\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"shall\"\n", " value: \"we\"\n", " value: \"three\"\n", " value: \"meet\"\n", " value: \"again\"\n", " value: \"?\"\n", " }\n", " }\n", " feature {\n", " bytes_list {\n", " value: \"In\"\n", " value: \"thunder\"\n", " value: \",\"\n", " value: \"lightning\"\n", " value: \",\"\n", " value: \"or\"\n", " value: \"in\"\n", " value: \"rain\"\n", " value: \"?\"\n", " }\n", " }\n", " }\n", " }\n", "}" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sequence_example" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "serialized_sequence_example = sequence_example.SerializeToString()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "context_feature_descriptions = {\n", " \"author_id\": tf.io.FixedLenFeature([], tf.int64, default_value=0),\n", " \"title\": tf.io.VarLenFeature(tf.string),\n", " \"pub_date\": tf.io.FixedLenFeature([3], tf.int64, default_value=[0, 0, 0]),\n", "}\n", "sequence_feature_descriptions = {\n", " \"content\": tf.io.VarLenFeature(tf.string),\n", " \"comments\": tf.io.VarLenFeature(tf.string),\n", "}\n", "parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(\n", " serialized_sequence_example, context_feature_descriptions,\n", " 
sequence_feature_descriptions)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'title': ,\n", " 'author_id': ,\n", " 'pub_date': }" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_context" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_context[\"title\"].values" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'comments': ,\n", " 'content': }" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_feature_lists" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(tf.RaggedTensor.from_sparse(parsed_feature_lists[\"content\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The Features API" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's use the variant of the California housing dataset that we used in Chapter 2, since it contains categorical features and missing values:" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "import os\n", "import tarfile\n", "import urllib.request\n", "\n", "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", "HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n", "HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n", "\n", "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", "    \"\"\"Download the housing archive and extract it into `housing_path`.\n", "\n", "    Creates `housing_path` if needed, saves the file fetched from `housing_url`\n", "    as `housing.tgz` inside it, then unpacks the archive in that directory.\n", "    \"\"\"\n", "    os.makedirs(housing_path, exist_ok=True)\n", "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", "    urllib.request.urlretrieve(housing_url, tgz_path)\n", "    # The context manager closes the archive even if extraction raises.\n", "    with tarfile.open(tgz_path) as housing_tgz:\n", "        housing_tgz.extractall(path=housing_path)" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "fetch_housing_data()" ] }, 
{ "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def load_housing_data(housing_path=HOUSING_PATH):\n", "    \"\"\"Read `housing.csv` from `housing_path` into a pandas DataFrame.\"\"\"\n", "    csv_path = os.path.join(housing_path, \"housing.csv\")\n", "    return pd.read_csv(csv_path)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.8841.0880.0129.0322.0126.08.3252452600.0NEAR BAY
1-122.2237.8621.07099.01106.02401.01138.08.3014358500.0NEAR BAY
2-122.2437.8552.01467.0190.0496.0177.07.2574352100.0NEAR BAY
3-122.2537.8552.01274.0235.0558.0219.05.6431341300.0NEAR BAY
4-122.2537.8552.01627.0280.0565.0259.03.8462342200.0NEAR BAY
\n", "
" ], "text/plain": [ " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", "0 -122.23 37.88 41.0 880.0 129.0 \n", "1 -122.22 37.86 21.0 7099.0 1106.0 \n", "2 -122.24 37.85 52.0 1467.0 190.0 \n", "3 -122.25 37.85 52.0 1274.0 235.0 \n", "4 -122.25 37.85 52.0 1627.0 280.0 \n", "\n", " population households median_income median_house_value ocean_proximity \n", "0 322.0 126.0 8.3252 452600.0 NEAR BAY \n", "1 2401.0 1138.0 8.3014 358500.0 NEAR BAY \n", "2 496.0 177.0 7.2574 352100.0 NEAR BAY \n", "3 558.0 219.0 5.6431 341300.0 NEAR BAY \n", "4 565.0 259.0 3.8462 342200.0 NEAR BAY " ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "housing = load_housing_data()\n", "housing.head()" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "age_mean, age_std = X_mean[1], X_std[1] # The median age is column in 1\n", "housing_median_age = tf.feature_column.numeric_column(\n", " \"housing_median_age\", normalizer_fn=lambda x: (x - age_mean) / age_std)" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "median_income = tf.feature_column.numeric_column(\"median_income\")\n", "bucketized_income = tf.feature_column.bucketized_column(\n", " median_income, boundaries=[1.5, 3., 4.5, 6.])" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3.0, 4.5, 6.0))" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bucketized_income" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ 
"ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']\n", "ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(\n", " \"ocean_proximity\", ocean_prox_vocab)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ocean_proximity" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "HashedCategoricalColumn(key='city', hash_bucket_size=1000, dtype=tf.string)" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Just an example, it's not used later on\n", "city_hash = tf.feature_column.categorical_column_with_hash_bucket(\n", " \"city\", hash_bucket_size=1000)\n", "city_hash" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "bucketized_age = tf.feature_column.bucketized_column(\n", " housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled\n", "age_and_ocean_proximity = tf.feature_column.crossed_column(\n", " [bucketized_age, ocean_proximity], hash_bucket_size=100)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "latitude = tf.feature_column.numeric_column(\"latitude\")\n", "longitude = tf.feature_column.numeric_column(\"longitude\")\n", "bucketized_latitude = tf.feature_column.bucketized_column(\n", " latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))\n", "bucketized_longitude = tf.feature_column.bucketized_column(\n", " longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))\n", "location = tf.feature_column.crossed_column(\n", " [bucketized_latitude, 
bucketized_longitude], hash_bucket_size=1000)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,\n", " dimension=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using Feature Columns for Parsing" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "median_house_value = tf.feature_column.numeric_column(\"median_house_value\")" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),\n", " 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [housing_median_age, median_house_value]\n", "feature_descriptions = tf.feature_column.make_parse_example_spec(columns)\n", "feature_descriptions" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "with tf.io.TFRecordWriter(\"my_data_with_features.tfrecords\") as f:\n", " for x, y in zip(X_train[:, 1:2], y_train):\n", " example = Example(features=Features(feature={\n", " \"housing_median_age\": Feature(float_list=FloatList(value=[x])),\n", " \"median_house_value\": Feature(float_list=FloatList(value=[y]))\n", " }))\n", " f.write(example.SerializeToString())" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "keras.backend.clear_session()\n", "np.random.seed(42)\n", "tf.random.set_seed(42)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "def 
parse_examples(serialized_examples):\n", " examples = tf.io.parse_example(serialized_examples, feature_descriptions)\n", " targets = examples.pop(\"median_house_value\") # separate the targets\n", " return examples, targets\n", "\n", "batch_size = 32\n", "dataset = tf.data.TFRecordDataset([\"my_data_with_features.tfrecords\"])\n", "dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning**: the `DenseFeatures` layer currently does not work with the Functional API, see [TF issue #27416](https://github.com/tensorflow/tensorflow/issues/27416). Hopefully this will be resolved before the final release of TF 2.0." ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a input: {'housing_median_age': }\n", "Consider rewriting this model with the Functional API.\n", "WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a input: {'housing_median_age': }\n", "Consider rewriting this model with the Functional API.\n", "362/362 [==============================] - 0s 675us/step - loss: 4.7553 - accuracy: 8.8428e-04\n", "Epoch 2/5\n", "362/362 [==============================] - 0s 622us/step - loss: 2.1622 - accuracy: 0.0021\n", "Epoch 3/5\n", "362/362 [==============================] - 0s 583us/step - loss: 1.4673 - accuracy: 0.0032\n", "Epoch 4/5\n", "362/362 [==============================] - 0s 543us/step - loss: 1.3786 - accuracy: 0.0033\n", "Epoch 5/5\n", "362/362 [==============================] - 0s 537us/step - loss: 1.3404 - accuracy: 0.0034\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns_without_target = columns[:-1]\n", "model = 
keras.models.Sequential([\n", "    keras.layers.DenseFeatures(feature_columns=columns_without_target),\n", "    keras.layers.Dense(1)\n", "])\n", "# This is a regression model: track mean absolute error, since classification\n", "# accuracy is not meaningful for continuous targets.\n", "model.compile(loss=\"mse\",\n", "              optimizer=keras.optimizers.SGD(learning_rate=1e-3),\n", "              metrics=[\"mae\"])\n", "model.fit(dataset, steps_per_epoch=len(X_train) // batch_size, epochs=5)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "some_columns = [ocean_proximity_embed, bucketized_income]\n", "dense_features = keras.layers.DenseFeatures(some_columns)\n", "dense_features({\n", " \"ocean_proximity\": [[\"NEAR OCEAN\"], [\"INLAND\"], [\"INLAND\"]],\n", " \"median_income\": [[3.], [7.2], [1.]]\n", "})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TF Transform" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "try:\n", "    import tensorflow_transform as tft\n", "\n", "    def preprocess(inputs):  # inputs is a batch of input features\n", "        \"\"\"Standardize the median age and map ocean proximity to vocabulary ids.\"\"\"\n", "        median_age = inputs[\"housing_median_age\"]\n", "        ocean_proximity = inputs[\"ocean_proximity\"]\n", "        # scale_to_z_score() already centers on the mean before dividing by the\n", "        # standard deviation, so no separate tft.mean() subtraction is needed.\n", "        standardized_age = tft.scale_to_z_score(median_age)\n", "        ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)\n", "        return {\n", "            \"standardized_median_age\": standardized_age,\n", "            \"ocean_proximity_id\": ocean_proximity_id\n", "        }\n", "except ImportError:\n", "    print(\"TF Transform is not installed. 
Try running: pip3 install -U tensorflow-transform\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TensorFlow Datasets" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Found a different version of the requested dataset:\n", "/Users/ageron/tensorflow_datasets/mnist/3.0.0\n", "Using /Users/ageron/tensorflow_datasets/mnist/3.0.1 instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /Users/ageron/tensorflow_datasets/mnist/3.0.1...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:absl:Dataset mnist is hosted on GCS. It will automatically be downloaded to your\n", "local data directory. If you'd instead prefer to read directly from our public\n", "GCS bucket (recommended if you're running on GCP), you can instead set\n", "data_dir=gs://tfds-data/datasets.\n", "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "766787dcfced4b7db1d4d66559378f1b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Dl Completed...: 0%| | 0/4 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(6,3))\n", "mnist_train = mnist_train.repeat(5).batch(32).prefetch(1)\n", "for item in mnist_train:\n", " images = item[\"image\"]\n", " labels = item[\"label\"]\n", " for index in range(5):\n", " plt.subplot(1, 5, index + 1)\n", " image = images[index, ..., 0]\n", " label = labels[index].numpy()\n", " plt.imshow(image, cmap=\"binary\")\n", " plt.title(label)\n", " plt.axis(\"off\")\n", " break # just showing part of the first batch" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(32, 28, 28, 1)\n", "[4 1 0 7 8 1 2 7 1 6 
# --- TensorFlow Datasets: batched (image, label) pipeline ---------------------
datasets = tfds.load(name="mnist")
mnist_train = datasets["train"]
mnist_test = datasets["test"]
mnist_train = (
    mnist_train
    .repeat(5)
    .batch(32)
    .map(lambda items: (items["image"], items["label"]))
    .prefetch(1)
)
for images, labels in mnist_train.take(1):
    print(images.shape)
    print(labels.numpy())

# Reset the Keras session and the random seeds.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# --- Let TFDS batch and unpack the data, then fit a softmax classifier --------
datasets = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = datasets["train"].repeat().prefetch(1)
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28, 1]))
model.add(keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)))
model.add(keras.layers.Dense(10, activation="softmax"))
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=["accuracy"],
)
# The dataset repeats forever, so the epoch length is given explicitly.
model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)

# --- TensorFlow Hub ------------------------------------------------------------
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

import tensorflow_hub as hub

hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-en-dim50/2",
    output_shape=[50], input_shape=[], dtype=tf.string)

model = keras.Sequential([
    hub_layer,
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

sentences = tf.constant(["It was a great movie", "The actors were amazing"])
embeddings = hub_layer(sentences)
embeddings

# --- Exercise 9a: Fashion MNIST split, datasets and Example protobufs ----------
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
# The first 5,000 instances form the validation set, the rest the training set.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train))
valid_set = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test))


def create_example(image, label):
    """Wrap one (image, label) pair in an `Example` protobuf.

    The image is serialized with `tf.io.serialize_tensor()` and stored as a
    single bytes value under the "image" key; the label is stored as a single
    int64 value under the "label" key.
    """
    serialized_image = tf.io.serialize_tensor(image).numpy()
    # Alternative for large images: tf.io.encode_jpeg(image[..., np.newaxis])
    image_feature = Feature(bytes_list=BytesList(value=[serialized_image]))
    label_feature = Feature(int64_list=Int64List(value=[label]))
    feature_map = {"image": image_feature, "label": label_feature}
    return Example(features=Features(feature=feature_map))
"execution_count": 122, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "features {\n", " feature {\n", " key: \"image\"\n", " value {\n", " bytes_list {\n", " valuerI\\000\\000\\001\\004\\000\\000\\000\\000\\001\\001\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\003\\000$\\210\\177>6\\000\\000\\000\\001\\003\\004\\000\\000\\003\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\006\\000f\\314\\260\\206\\220{\\027\\000\\000\\000\\000\\014\\n\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\233\\354\\317\\262k\\234\\241m@\\027M\\202H\\017\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\001\\000E\\317\\337\\332\\330\\330\\243\\177yz\\222\\215X\\254B\\000\\000\\000\\000\\000\\000\\000\\000\\000\\001\\001\\001\\000\\310\\350\\350\\351\\345\\337\\337\\327\\325\\244\\177{\\304\\345\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\267\\341\\330\\337\\344\\353\\343\\340\\336\\340\\335\\337\\365\\255\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\301\\344\\332\\325\\306\\264\\324\\322\\323\\325\\337\\334\\363\\312\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\001\\003\\000\\014\\333\\334\\324\\332\\300\\251\\343\\320\\332\\340\\324\\342\\305\\3214\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\006\\000c\\364\\336\\334\\332\\313\\306\\335\\327\\325\\336\\334\\365w\\2478\\000\\000\\000\\000\\000\\000\\000\\000\\000\\004\\000\\0007\\354\\344\\346\\344\\360\\350\\325\\332\\337\\352\\331\\331\\321\\\\\\000\\000\\000\\001\\004\\006\\007\\002\\000\\000\\000\\000\\000\\355\\342\\331\\337\\336\\333\\336\\335\\330\\337\\345\\327\\332\\377M\\000\\000\\003\\000\\000\\000\\000\\000\\000\\000>\\221\\314\\344\\317\\325\\335\\332\\320\\323\\332\\340\\337\\333\\327\\340\\364\\237\\000\\000\\000\\000\\000\\022,Rk\\275\\344\\334\\336\\331\\342\\310\\315\\323\\346\\340\\352\\260\\274\\372\\370\\351\\356\\327\\000\\0009\\273\\320\\340\\335\\3
40\\320\\314\\326\\320\\321\\310\\237\\365\\301\\316\\337\\377\\377\\335\\352\\335\\323\\334\\350\\366\\000\\003\\312\\344\\340\\335\\323\\323\\326\\315\\315\\315\\334\\360P\\226\\377\\345\\335\\274\\232\\277\\322\\314\\321\\336\\344\\341\\000b\\351\\306\\322\\336\\345\\345\\352\\371\\334\\302\\327\\331\\361AIju\\250\\333\\335\\327\\331\\337\\337\\340\\345\\035K\\314\\324\\314\\301\\315\\323\\341\\330\\271\\305\\316\\306\\325\\360\\303\\343\\365\\357\\337\\332\\324\\321\\336\\334\\335\\346C0\\313\\267\\302\\325\\305\\271\\276\\302\\300\\312\\326\\333\\335\\334\\354\\341\\330\\307\\316\\272\\265\\261\\254\\265\\315\\316s\\000z\\333\\301\\263\\253\\267\\304\\314\\322\\325\\317\\323\\322\\310\\304\\302\\277\\303\\277\\306\\300\\260\\234\\247\\261\\322\\\\\\000\\000J\\275\\324\\277\\257\\254\\257\\265\\271\\274\\275\\274\\301\\306\\314\\321\\322\\322\\323\\274\\274\\302\\300\\330\\252\\000\\002\\000\\000\\000B\\310\\336\\355\\357\\362\\366\\363\\364\\335\\334\\301\\277\\263\\266\\266\\265\\260\\246\\250c:\\000\\000\\000\\000\\000\\000\\000\\000\\000(=,H)#\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\\000\"\n", " }\n", " }\n", " }\n", " feature {\n", " key: \"label\"\n", " value {\n", " int64_list {\n", " value: 9\n", " }\n", " }\n", " }\n", "}\n", "\n" ] } ], "source": [ "for image, label in valid_set.take(1):\n", " print(create_example(image, label))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following function saves a given dataset to a set of TFRecord files. The examples are written to the files in a round-robin fashion. 
from contextlib import ExitStack

def write_tfrecords(name, dataset, n_shards=10):
    """Write `dataset` to `n_shards` TFRecord files, round-robin.

    Args:
        name: prefix of the shard file names.
        dataset: iterable dataset of (image, label) pairs; each pair is
            converted with `create_example()` and serialized.
        n_shards: number of files to spread the records over.

    Returns:
        The list of shard file paths, in shard order.
    """
    shard_paths = []
    for shard_index in range(n_shards):
        shard_paths.append(
            "{}.tfrecord-{:05d}-of-{:05d}".format(name, shard_index, n_shards))
    # ExitStack closes every writer on exit, even if a write fails midway.
    with ExitStack() as exit_stack:
        shard_writers = []
        for shard_path in shard_paths:
            writer = exit_stack.enter_context(tf.io.TFRecordWriter(shard_path))
            shard_writers.append(writer)
        for position, (image, label) in dataset.enumerate():
            record = create_example(image, label).SerializeToString()
            # record number modulo n_shards picks the target file
            shard_writers[position % n_shards].write(record)
    return shard_paths
def preprocess(tfrecord):
    """Parse one serialized `Example` into an (image, label) pair.

    The "image" feature is decoded with `tf.io.parse_tensor()` as uint8 and
    reshaped to [28, 28]; the "label" feature is returned as parsed (int64).
    """
    parsed = tf.io.parse_single_example(
        tfrecord,
        {
            "image": tf.io.FixedLenFeature([], tf.string, default_value=""),
            "label": tf.io.FixedLenFeature([], tf.int64, default_value=-1),
        })
    pixels = tf.io.parse_tensor(parsed["image"], out_type=tf.uint8)
    # Alternative if the images were JPEG-encoded: tf.io.decode_jpeg(parsed["image"])
    return tf.reshape(pixels, shape=[28, 28]), parsed["label"]


def mnist_dataset(filepaths, n_read_threads=5, shuffle_buffer_size=None,
                  n_parse_threads=5, batch_size=32, cache=True):
    """Build a batched, prefetched dataset of parsed records from TFRecord files.

    Args:
        filepaths: TFRecord file paths to read.
        n_read_threads: value passed as `num_parallel_reads`.
        shuffle_buffer_size: shuffle buffer size; no shuffling when falsy.
        n_parse_threads: value passed as `num_parallel_calls` when parsing.
        batch_size: number of records per batch.
        cache: whether to cache the raw records (before shuffling and parsing).
    """
    records = tf.data.TFRecordDataset(filepaths,
                                      num_parallel_reads=n_read_threads)
    records = records.cache() if cache else records
    if shuffle_buffer_size:
        records = records.shuffle(shuffle_buffer_size)
    return (records
            .map(preprocess, num_parallel_calls=n_parse_threads)
            .batch(batch_size)
            .prefetch(1))


class Standardization(keras.layers.Layer):
    """Layer that subtracts a mean and divides by a standard deviation.

    Both statistics are computed by `adapt()` along axis 0 of a data sample,
    so `adapt()` must be called before the layer is used.
    """

    def adapt(self, data_sample):
        """Compute and store the mean and standard deviation along axis 0."""
        stats_kwargs = dict(axis=0, keepdims=True)
        self.means_ = np.mean(data_sample, **stats_kwargs)
        self.stds_ = np.std(data_sample, **stats_kwargs)

    def call(self, inputs):
        """Return the standardized inputs."""
        centered = inputs - self.means_
        # epsilon avoids a division by zero where the standard deviation is 0
        scale = self.stds_ + keras.backend.epsilon()
        return centered / scale
[==============================] - 3s 2ms/step - loss: 146.5866 - accuracy: 0.8914 - val_loss: 361.5933 - val_accuracy: 0.9058\n", "Epoch 4/5\n", "1719/1719 [==============================] - 3s 2ms/step - loss: 110.8240 - accuracy: 0.9014 - val_loss: 150.4520 - val_accuracy: 0.9143\n", "Epoch 5/5\n", "1719/1719 [==============================] - 3s 2ms/step - loss: 175.6303 - accuracy: 0.9106 - val_loss: 42.5092 - val_accuracy: 0.9141\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datetime import datetime\n", "logs = os.path.join(os.curdir, \"my_logs\",\n", " \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n", "\n", "tensorboard_cb = tf.keras.callbacks.TensorBoard(\n", " log_dir=logs, histogram_freq=1, profile_batch=10)\n", "\n", "model.fit(train_set, epochs=5, validation_data=valid_set,\n", " callbacks=[tensorboard_cb])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning:** The profiling tab in TensorBoard only works if you use TensorFlow 2.2+. You also need to make sure `tensorboard_plugin_profile` is installed (and restart Jupyter if necessary)." ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%load_ext tensorboard\n", "%tensorboard --logdir=./my_logs --port=6006" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10.\n", "_Exercise: In this exercise you will download a dataset, split it, create a `tf.data.Dataset` to load it and preprocess it efficiently, then build and train a binary classification model containing an `Embedding` layer._\n", "\n", "### a.\n", "_Exercise: Download the [Large Movie Review Dataset](https://homl.info/imdb), which contains 50,000 movie reviews from the [Internet Movie Database](https://imdb.com/). 
The data is organized in two directories, `train` and `test`, each containing a `pos` subdirectory with 12,500 positive reviews and a `neg` subdirectory with 12,500 negative reviews. Each review is stored in a separate text file. There are other files and folders (including preprocessed bag-of-words), but we will ignore them in this exercise._" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", "84131840/84125825 [==============================] - 12s 0us/step\n" ] }, { "data": { "text/plain": [ "PosixPath('/Users/ageron/.keras/datasets/aclImdb')" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from pathlib import Path\n", "\n", "DOWNLOAD_ROOT = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n", "FILENAME = \"aclImdb_v1.tar.gz\"\n", "filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)\n", "path = Path(filepath).parent / \"aclImdb\"\n", "path" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "aclImdb/\n", " README\n", " imdb.vocab\n", " imdbEr.txt\n", " test/\n", " labeledBow.feat\n", " urls_neg.txt\n", " urls_pos.txt\n", " neg/\n", " 0_2.txt\n", " 10000_4.txt\n", " 10001_1.txt\n", " ...\n", " pos/\n", " 0_10.txt\n", " 10000_7.txt\n", " 10001_9.txt\n", " ...\n", " train/\n", " labeledBow.feat\n", " unsupBow.feat\n", " urls_neg.txt\n", " ...\n", " neg/\n", " 0_3.txt\n", " 10000_4.txt\n", " 10001_4.txt\n", " ...\n", " unsup/\n", " 0_0.txt\n", " 10000_0.txt\n", " 10001_0.txt\n", " ...\n", " pos/\n", " 0_9.txt\n", " 10000_8.txt\n", " 10001_10.txt\n", " ...\n" ] } ], "source": [ "for name, subdirs, files in os.walk(path):\n", " indent = len(Path(name).parts) - len(path.parts)\n", " print(\" \" * indent + Path(name).parts[-1] + 
def review_paths(dirpath):
    """Return the paths of all `*.txt` files directly inside `dirpath`, sorted.

    Args:
        dirpath: a `pathlib.Path` pointing to a directory of review files.

    Returns:
        A list of path strings in lexicographic order. `Path.glob()` yields
        entries in an arbitrary, filesystem-dependent order, so the result is
        sorted to keep the seeded shuffle and the test/validation split
        reproducible from one machine to the next.
    """
    return sorted(str(path) for path in dirpath.glob("*.txt"))


def imdb_dataset(filepaths_positive, filepaths_negative):
    """Load every review in memory and return a dataset of (review, label) pairs.

    Args:
        filepaths_positive: paths of the positive reviews (label 1).
        filepaths_negative: paths of the negative reviews (label 0).

    Returns:
        A `tf.data.Dataset` built with `from_tensor_slices()`; the negative
        reviews come first, followed by the positive ones.
    """
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
        for filepath in filepaths:
            # Explicit encoding: the default of open() depends on the locale,
            # which can corrupt or reject non-ASCII reviews on some platforms.
            with open(filepath, encoding="utf-8") as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices(
        (tf.constant(reviews), tf.constant(labels)))

Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.\", shape=(), dtype=string)\n", "tf.Tensor(0, shape=(), dtype=int32)\n", "\n", "tf.Tensor(b'Well...tremors I, the original started off in 1990 and i found the movie quite enjoyable to watch. however, they proceeded to make tremors II and III. Trust me, those movies started going downhill right after they finished the first one, i mean, ass blasters??? Now, only God himself is capable of answering the question \"why in Gods name would they create another one of these dumpster dives of a movie?\" Tremors IV cannot be considered a bad movie, in fact it cannot be even considered an epitome of a bad movie, for it lives up to more than that. As i attempted to sit though it, i noticed that my eyes started to bleed, and i hoped profusely that the little girl from the ring would crawl through the TV and kill me. did they really think that dressing the people who had stared in the other movies up as though they we\\'re from the wild west would make the movie (with the exact same occurrences) any better? honestly, i would never suggest buying this movie, i mean, there are cheaper ways to find things that burn well.', shape=(), dtype=string)\n", "tf.Tensor(0, shape=(), dtype=int32)\n", "\n", "tf.Tensor(b\"Ouch! This one was a bit painful to sit through. It has a cute and amusing premise, but it all goes to hell from there. Matthew Modine is almost always pedestrian and annoying, and he does not disappoint in this one. Deborah Kara Unger and John Neville turned in surprisingly decent performances. Alan Bates and Jennifer Tilly, among others, played it way over the top. I know that's the way the parts were written, and it's hard to blame actors, when the script and director have them do such schlock. If you're going to have outrageous characters, that's OK, but you gotta have good material to make it work. It didn't here. 
Run away screaming from this movie if at all possible.\", shape=(), dtype=string)\n", "tf.Tensor(0, shape=(), dtype=int32)\n", "\n" ] } ], "source": [ "for X, y in imdb_dataset(train_pos, train_neg).take(3):\n", " print(X)\n", " print(y)\n", " print()" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "17.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It takes about 17 seconds to load the dataset and go through it 10 times." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But let's pretend the dataset does not fit in memory, just to make things more interesting. Luckily, each review fits on just one line (they use `
def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):
    """Stream the reviews from disk as a dataset of (review, label) pairs.

    Each file is read line by line with a `TextLineDataset`, so every line
    becomes one record. Positive reviews (label 1) come first, followed by the
    negative ones (label 0).

    Args:
        filepaths_positive: paths of the positive reviews.
        filepaths_negative: paths of the negative reviews.
        n_read_threads: value passed as `num_parallel_reads`.
    """
    def labeled_lines(filepaths, label):
        # one record per text line, paired with a constant label
        lines = tf.data.TextLineDataset(filepaths,
                                        num_parallel_reads=n_read_threads)
        return lines.map(lambda review: (review, label))

    negatives = labeled_lines(filepaths_negative, 0)
    positives = labeled_lines(filepaths_positive, 1)
    return positives.concatenate(negatives)
of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).cache().repeat(10): pass" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [], "source": [ "batch_size = 32\n", "\n", "train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)\n", "valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)\n", "test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### d.\n", "_Exercise: Create a binary classification model, using a `TextVectorization` layer to preprocess each review. If the `TextVectorization` layer is not yet available (or if you like a challenge), try to create your own custom preprocessing layer: you can use the functions in the `tf.strings` package, for example `lower()` to make everything lowercase, `regex_replace()` to replace punctuation with spaces, and `split()` to split words on spaces. You should use a lookup table to output word indices, which must be prepared in the `adapt()` method._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's first write a function to preprocess the reviews, cropping them to 300 characters, converting them to lower case, then replacing `
def preprocess(X_batch, n_words=50):
    """Clean and tokenize a batch of reviews into a dense tensor of words.

    Each review is cropped to its first 300 characters, lowercased, stripped
    of HTML line-break tags and of every character outside a-z (both replaced
    with spaces), then split on whitespace.

    Args:
        X_batch: a batch of review strings.
        n_words: number of tokens kept per review; shorter reviews are padded,
            longer ones are cropped.

    Returns:
        A string tensor with `n_words` tokens per review.
    """
    # Keep the batch dimension of the input and force the second one to n_words.
    shape = tf.shape(X_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])
    Z = tf.strings.substr(X_batch, 0, 300)
    Z = tf.strings.lower(Z)
    # Remove line-break tags first. The pattern must not be empty: an empty
    # regex matches at every position, which would insert a space between all
    # characters and turn every letter into its own token.
    Z = tf.strings.regex_replace(Z, b"<br\\s*/?>", b" ")
    Z = tf.strings.regex_replace(Z, b"[^a-z]", b" ")
    Z = tf.strings.split(Z)
    return Z.to_tensor(shape=shape, default_value=b"")
from collections import Counter

def get_vocabulary(data_sample, max_size=1000):
    """Return the most frequent words of a data sample, padding token first.

    Args:
        data_sample: a batch of raw reviews; it is tokenized with `preprocess()`.
        max_size: maximum number of words kept, not counting the padding token.

    Returns:
        A list starting with the padding token, followed by up to `max_size`
        words ordered from most to least frequent.
    """
    token_rows = preprocess(data_sample).numpy()
    # padding tokens are not counted as words
    frequencies = Counter(
        token for row in token_rows for token in row if token != b"")
    vocabulary = [b""]
    vocabulary.extend(token for token, _ in frequencies.most_common(max_size))
    return vocabulary
class TextVectorization(keras.layers.Layer):
    """Layer that maps raw review strings to padded sequences of word IDs.

    `adapt()` must be called before the layer is used: it builds the
    vocabulary and the lookup table that `call()` relies on.
    """

    def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):
        """Store the hyperparameters; the lookup table is built by `adapt()`."""
        super().__init__(dtype=dtype, **kwargs)
        self.n_oov_buckets = n_oov_buckets
        self.max_vocabulary_size = max_vocabulary_size

    def adapt(self, data_sample):
        """Compute the vocabulary of `data_sample` and build the lookup table."""
        self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)
        # each word is mapped to its position in the vocabulary list
        initializer = tf.lookup.KeyValueTensorInitializer(
            tf.constant(self.vocab),
            tf.range(len(self.vocab), dtype=tf.int64))
        self.table = tf.lookup.StaticVocabularyTable(initializer,
                                                     self.n_oov_buckets)

    def call(self, inputs):
        """Tokenize the reviews with `preprocess()` and look up each word's ID."""
        return self.table.lookup(preprocess(inputs))
As you can see, each review was cleaned up and tokenized, then each word was encoded as its index in the vocabulary (all the 0s correspond to the padding tokens).\n", "\n", "Now let's create another `TextVectorization` layer and let's adapt it to the full IMDB training set (if the training set did not fit in RAM, we could just use a smaller sample of the training set by calling `train_set.take(500)`):" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "max_vocabulary_size = 1000\n", "n_oov_buckets = 100\n", "\n", "sample_review_batches = train_set.map(lambda review, label: review)\n", "sample_reviews = np.concatenate(list(sample_review_batches.as_numpy_iterator()),\n", " axis=0)\n", "\n", "text_vectorization = TextVectorization(max_vocabulary_size, n_oov_buckets,\n", " input_shape=[])\n", "text_vectorization.adapt(sample_reviews)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's run it on the same `X_example`, just to make sure the word IDs are larger now, since the vocabulary is bigger:" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_vectorization(X_example)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Good! Now let's take a look at the first 10 words in the vocabulary:" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[b'', b'the', b'a', b'of', b'and', b'i', b'to', b'is', b'this', b'it']" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_vectorization.vocab[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "These are the most common words in the reviews." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now to build our model we will need to encode all these word IDs somehow. 
One approach is to create bags of words: for each review, and for each word in the vocabulary, we count the number of occurrences of that word in the review. For example:" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])\n", "tf.reduce_sum(tf.one_hot(simple_example, 4), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first review has 2 times the word 0, 2 times the word 1, 0 times the word 2, and 1 time the word 3, so its bag-of-words representation is `[2, 2, 0, 1]`. Similarly, the second review has 3 times the word 0, 0 times the word 1, and so on. Let's wrap this logic in a small custom layer, and let's test it. We'll drop the counts for the word 0, since this corresponds to the `<pad>` token, which we don't care about." ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "class BagOfWords(keras.layers.Layer):\n", " def __init__(self, n_tokens, dtype=tf.int32, **kwargs):\n", " super().__init__(dtype=dtype, **kwargs)\n", " self.n_tokens = n_tokens\n", " def call(self, inputs):\n", " one_hot = tf.one_hot(inputs, self.n_tokens)\n", " return tf.reduce_sum(one_hot, axis=1)[:, 1:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's test it:" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bag_of_words = BagOfWords(n_tokens=4)\n", "bag_of_words(simple_example)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It works fine! 
Now let's create another `BagOfWords` layer with the right vocabulary size for our training set:" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "n_tokens = max_vocabulary_size + n_oov_buckets + 1 # add 1 for \n", "bag_of_words = BagOfWords(n_tokens)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We're ready to train the model!" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "782/782 [==============================] - 5s 5ms/step - loss: 0.5834 - accuracy: 0.6784 - val_loss: 0.5116 - val_accuracy: 0.7376\n", "Epoch 2/5\n", "782/782 [==============================] - 5s 5ms/step - loss: 0.4647 - accuracy: 0.7738 - val_loss: 0.4998 - val_accuracy: 0.7445\n", "Epoch 3/5\n", "782/782 [==============================] - 5s 5ms/step - loss: 0.4141 - accuracy: 0.8062 - val_loss: 0.5025 - val_accuracy: 0.7457\n", "Epoch 4/5\n", "782/782 [==============================] - 5s 5ms/step - loss: 0.3506 - accuracy: 0.8536 - val_loss: 0.5308 - val_accuracy: 0.7465\n", "Epoch 5/5\n", "782/782 [==============================] - 5s 5ms/step - loss: 0.2642 - accuracy: 0.9039 - val_loss: 0.5681 - val_accuracy: 0.7351\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = keras.models.Sequential([\n", " text_vectorization,\n", " bag_of_words,\n", " keras.layers.Dense(100, activation=\"relu\"),\n", " keras.layers.Dense(1, activation=\"sigmoid\"),\n", "])\n", "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", " metrics=[\"accuracy\"])\n", "model.fit(train_set, epochs=5, validation_data=valid_set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We get about 73.5% accuracy on the validation set after just the first epoch, but after that the model makes no significant progress. We will do better in Chapter 16. 
For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### e.\n", "_Exercise: Add an `Embedding` layer and compute the mean embedding for each review, multiplied by the square root of the number of words (see Chapter 16). This rescaled mean embedding can then be passed to the rest of your model._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To compute the mean embedding for each review, and multiply it by the square root of the number of words in that review, we will need a little function. For each sentence, this function needs to compute $M \\times \\sqrt N$, where $M$ is the mean of all the word embeddings in the sentence (excluding padding tokens), and $N$ is the number of words in the sentence (also excluding padding tokens). We can rewrite $M$ as $\\dfrac{S}{N}$, where $S$ is the sum of all word embeddings (it does not matter whether or not we include the padding tokens in this sum, since their representation is a zero vector). So the function must return $M \\times \\sqrt N = \\dfrac{S}{N} \\times \\sqrt N = \\dfrac{S}{\\sqrt N \\times \\sqrt N} \\times \\sqrt N= \\dfrac{S}{\\sqrt N}$." 
] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def compute_mean_embedding(inputs):\n", " not_pad = tf.math.count_nonzero(inputs, axis=-1)\n", " n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True) \n", " sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))\n", " return tf.reduce_sum(inputs, axis=1) / sqrt_n_words\n", "\n", "another_example = tf.constant([[[1., 2., 3.], [4., 5., 0.], [0., 0., 0.]],\n", " [[6., 0., 0.], [0., 0., 0.], [0., 0., 0.]]])\n", "compute_mean_embedding(another_example)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's check that this is correct. The first review contains 2 words (the last token is a zero vector, which represents the `<pad>` token). Let's compute the mean embedding for these 2 words, and multiply the result by the square root of 2:" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.reduce_mean(another_example[0:1, :2], axis=1) * tf.sqrt(2.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks good! Now let's check the second review, which contains just one word (we ignore the two padding tokens):" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.reduce_mean(another_example[1:2, :1], axis=1) * tf.sqrt(1.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Perfect. Now we're ready to train our final model. 
It's the same as before, except we replaced the `BagOfWords` layer with an `Embedding` layer followed by a `Lambda` layer that calls the `compute_mean_embedding` function:" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [], "source": [ "embedding_size = 20\n", "\n", "model = keras.models.Sequential([\n", " text_vectorization,\n", " keras.layers.Embedding(input_dim=n_tokens,\n", " output_dim=embedding_size,\n", " mask_zero=True), # tokens => zero vectors\n", " keras.layers.Lambda(compute_mean_embedding),\n", " keras.layers.Dense(100, activation=\"relu\"),\n", " keras.layers.Dense(1, activation=\"sigmoid\"),\n", "])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### f.\n", "_Exercise: Train the model and see what accuracy you get. Try to optimize your pipelines to make training as fast as possible._" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "782/782 [==============================] - 3s 2ms/step - loss: 0.6053 - accuracy: 0.6568 - val_loss: 0.5151 - val_accuracy: 0.7382\n", "Epoch 2/5\n", "782/782 [==============================] - 2s 2ms/step - loss: 0.4922 - accuracy: 0.7569 - val_loss: 0.5081 - val_accuracy: 0.7466\n", "Epoch 3/5\n", "782/782 [==============================] - 2s 2ms/step - loss: 0.4827 - accuracy: 0.7628 - val_loss: 0.4978 - val_accuracy: 0.7473\n", "Epoch 4/5\n", "782/782 [==============================] - 2s 2ms/step - loss: 0.4761 - accuracy: 0.7656 - val_loss: 0.4959 - val_accuracy: 0.7513\n", "Epoch 5/5\n", "782/782 [==============================] - 3s 2ms/step - loss: 0.4737 - accuracy: 0.7687 - val_loss: 0.4978 - val_accuracy: 0.7471\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n", "model.fit(train_set, epochs=5, 
validation_data=valid_set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The model is not better using embeddings (but we will do better in Chapter 16). The pipeline looks fast enough (we optimized it earlier)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### g.\n", "_Exercise: Use TFDS to load the same dataset more easily: `tfds.load(\"imdb_reviews\")`._" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "import tensorflow_datasets as tfds\n", "\n", "datasets = tfds.load(name=\"imdb_reviews\")\n", "train_set, test_set = datasets[\"train\"], datasets[\"test\"]" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b\"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. 
I could barely sit through it.\", shape=(), dtype=string)\n", "tf.Tensor(0, shape=(), dtype=int64)\n" ] } ], "source": [ "for example in train_set.take(1):\n", " print(example[\"text\"])\n", " print(example[\"label\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "nav_menu": { "height": "264px", "width": "369px" }, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }