{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 340 }, "colab_type": "code", "executionInfo": { "elapsed": 2891, "status": "ok", "timestamp": 1525625180709, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "cskEklatDPtU", "outputId": "5d4efe3e-975c-4b02-e1c1-cdf5ed278c05" }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# code for loading the format for the notebook\n", "import os\n", "\n", "# path : store the current path to convert back to it later\n", "path = os.getcwd()\n", "os.chdir(os.path.join('..', 'notebook_format'))\n", "\n", "from formats import load_style\n", "load_style(plot_style = False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 1223, "status": "ok", "timestamp": 1525625181961, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "jknw1s2rGOAF", "outputId": "801b8d12-dcac-4703-962c-c3885a655981" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Ethen 2018-05-07 20:18:55 \n", "\n", "CPython 3.6.4\n", "IPython 6.3.1\n", "\n", "keras 2.1.5\n", "numpy 1.14.3\n", "matplotlib 2.2.2\n", "tensorflow 1.7.0\n" ] } ], "source": [ "os.chdir(path)\n", "\n", "import os\n", "import string\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from time import time\n", "from collections import Counter\n", "from keras.utils import to_categorical\n", "from keras.utils.data_utils import get_file\n", "from keras.models import Sequential, load_model\n", "from keras.layers import Embedding, LSTM, Dense\n", "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", "\n", "# 1. magic so that the notebook will reload external python modules\n", "# 2. magic for inline plot\n", "# 3. magic to enable retina (high resolution) plots\n", "# https://gist.github.com/minrk/3301035\n", "# 4. magic to print version\n", "%load_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "%load_ext watermark\n", "%watermark -a 'Ethen' -d -t -v -p keras,numpy,matplotlib,tensorflow" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Keras RNN (Recurrent Neural Network) - Language Model\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Language Modeling (LM) is one of the foundational task in the realm of natural language processing (NLP). At a high level, the goal is to predict the n + 1 token in a sequence given the n tokens preceding it. A well trained language model are used in applications such as machine translation, speech recognition or to be more concrete business applications such as Swiftkey.\n", "\n", "Language Model can operate either at the word level, sub-word level or character level, each having its own unique set of benefits and challenges. 
, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implementation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This documentation demonstrates the basic workflow of:\n", "\n", "- Preparing text for developing a word-level language model.\n", "- Training a neural network that contains an embedding and an LSTM layer, then using the learned model to generate new text with properties similar to those of the input text." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 } }, "colab_type": "code", "id": "FZSYHYjNG1_d" }, "outputs": [], "source": [ "def elapsed(sec):\n", "    \"\"\"\n", "    Converts elapsed time into a more human readable format.\n", "\n", "    Examples\n", "    --------\n", "    from time import time\n", "\n", "    start = time()\n", "    # do something that's worth timing, like training a model\n", "    elapse = time() - start\n", "    elapsed(elapse)\n", "    \"\"\"\n", "    if sec < 60:\n", "        return str(sec) + ' seconds'\n", "    elif sec < (60 * 60):\n", "        return str(sec / 60) + ' minutes'\n", "    else:\n", "        return str(sec / (60 * 60)) + ' hours'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 170 }, "colab_type": "code", "executionInfo": { "elapsed": 554, "status": "ok", "timestamp": 1525625183502, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "b0xuoXHyHXZi", "outputId": "643b866b-a771-4a4c-aef5-9c322ff50226" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt\n", "606208/600901 [==============================] - 1s 2us/step\n", "corpus length: 600893\n", "example text: PREFACE\n", "\n", "\n", "SUPPOSING that Truth is a woman--what then? 
Is there not ground\n", "for suspecting that all philosophers, in so far as they have been\n", "dogmatists\n" ] } ], "source": [ "path = get_file('nietzsche.txt', origin = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt')\n", "with open(path, encoding = 'utf-8') as f:\n", "    raw_text = f.read()\n", "\n", "print('corpus length:', len(raw_text))\n", "print('example text:', raw_text[:150])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As with all text analysis, there are many preprocessing steps that need to be done to make the corpus ready for downstream modeling. Here we'll stick to some really basic ones, as preprocessing is not the main focus of this post. The steps include:\n", "\n", "- Splitting the text into words/tokens based on spaces. From the first few words, we can see that some words are separated by \"--\", hence we'll first replace that with a space.\n", "- Removing punctuation marks and retaining only alphabetical words." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 51 }, "colab_type": "code", "executionInfo": { "elapsed": 395, "status": "ok", "timestamp": 1525625186624, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "9I-kJbNDYk6s", "outputId": "35f8bf90-05b9-476b-fb7c-e0ba75172341" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sampled original text: ['PREFACE', 'SUPPOSING', 'that', 'Truth', 'is', 'a', 'woman', 'what', 'then?', 'Is']\n", "sampled cleaned text: ['preface', 'supposing', 'that', 'truth', 'is', 'a', 'woman', 'what', 'then', 'is']\n" ] } ], "source": [ "# ideally, we would save the cleaned text, to prevent\n", "# doing this step every single time\n", "tokens = raw_text.replace('--', ' ').split()\n", "cleaned_tokens = []\n", "table = str.maketrans('', '', string.punctuation)\n", "for word in tokens:\n", "    word = word.translate(table)\n", "    if word.isalpha():\n", "        cleaned_tokens.append(word.lower())\n", "\n", "print('sampled original text: ', tokens[:10])\n", "print('sampled cleaned text: ', cleaned_tokens[:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next step is to build a vocabulary that maps each distinct word to an integer, so we can convert the tokens into integers and feed them into our model later."
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 51 }, "colab_type": "code", "executionInfo": { "elapsed": 338, "status": "ok", "timestamp": 1525625188800, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "_8D5pIXuHhpg", "outputId": "beea10f2-611a-40e2-e250-502cd75801c5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "vocabulary size: 5090\n", "filtered words: 5097\n" ] } ], "source": [ "# build up vocabulary,\n", "# rare words will also be considered out of vocabulary words,\n", "# this will be represented by an unknown token\n", "min_count = 2\n", "unknown_token = ''\n", "word2index = {unknown_token: 0}\n", "index2word = [unknown_token]\n", "\n", "filtered_words = 0\n", "counter = Counter(cleaned_tokens)\n", "for word, count in counter.items():\n", " if count >= min_count:\n", " index2word.append(word)\n", " word2index[word] = len(word2index)\n", " else:\n", " filtered_words += 1\n", "\n", "num_classes = len(word2index)\n", "print('vocabulary size: ', num_classes)\n", "print('filtered words: ', filtered_words)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Recall that a language model's task is to take $n$ words and predict the $n + 1$ word, hence a key design decision is how long the input sequence should be. There is no one size fits all solution to this problem. Here, we will split them into sub-sequences with a fixed length of 40 and map the original word to indices." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 102 }, "colab_type": "code", "executionInfo": { "elapsed": 1242, "status": "ok", "timestamp": 1525625192008, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "yPahaVJsH8ld", "outputId": "a5c37cd6-8079-472a-b158-46bb1bbbc17f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sequence dimension: (33342, 40)\n", "target dimension: (33342, 5090)\n", "example sequence:\n", " [ 1 2 3 4 5 6 7 8 9 5 10 11 12 13 0 3 14 15 16 17 18 19 20 21\n", " 22 23 21 24 25 26 27 3 28 29 30 31 32 0 33 34]\n" ] } ], "source": [ "# create semi-overlapping sequences of words with\n", "# a fixed length specified by the maxlen parameter\n", "step = 3\n", "maxlen = 40\n", "X = []\n", "y = []\n", "for i in range(0, len(cleaned_tokens) - maxlen, step):\n", " sentence = cleaned_tokens[i:i + maxlen]\n", " next_word = cleaned_tokens[i + maxlen]\n", " X.append([word2index.get(word, 0) for word in sentence])\n", " y.append(word2index.get(next_word, 0))\n", "\n", "# keras expects the target to be in one-hot encoded format,\n", "# ideally we would use a generator that performs this conversion\n", "# only on the batch of data that is currently required by the model\n", "# to be more memory-efficient\n", "X = np.array(X)\n", "Y = to_categorical(y, num_classes)\n", "print('sequence dimension: ', X.shape)\n", "print('target dimension: ', Y.shape)\n", "print('example sequence:\\n', X[0])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 
, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 255 }, "colab_type": "code", "executionInfo": { "elapsed": 647, "status": "ok", "timestamp": 1525625194078, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "ee6rX-SdIdo-", "outputId": "d441541c-4046-495d-e9b4-86094723733d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_1 (Embedding) (None, 40, 50) 254500 \n", "_________________________________________________________________\n", "lstm_1 (LSTM) (None, 256) 314368 \n", "_________________________________________________________________\n", "dense_1 (Dense) (None, 5090) 1308130 \n", "=================================================================\n", "Total params: 1,876,998\n", "Trainable params: 1,876,998\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n" ] } ], "source": [ "# define the network architecture: an embedding layer followed by an LSTM layer\n", "embedding_size = 50\n", "lstm_size = 256\n", "model1 = Sequential()\n", "model1.add(Embedding(num_classes, embedding_size, input_length = maxlen))\n", "model1.add(LSTM(lstm_size))\n", "model1.add(Dense(num_classes, activation = 'softmax'))\n", "model1.compile(loss = 'categorical_crossentropy', optimizer = 'adam')\n", "print(model1.summary())" ] }
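, { "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check, the parameter counts in the summary above can be derived by hand (an LSTM has four gates, each with input weights, recurrent weights and a bias):\n", "\n", "- Embedding: $5090 \\times 50 = 254{,}500$\n", "- LSTM: $4 \\times ((50 + 256) \\times 256 + 256) = 314{,}368$\n", "- Dense: $256 \\times 5090 + 5090 = 1{,}308{,}130$" ] }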
, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 323 }, "colab_type": "code", "executionInfo": { "elapsed": 714724, "status": "ok", "timestamp": 1525625919606, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "saitx4cKIdxK", "outputId": "9726a8b7-b3d1-4a18-dbc1-cd53db08bfd7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "model checkpoint address: lstm_weights1.hdf5\n", "Train on 26673 samples, validate on 6669 samples\n", "Epoch 1/40\n", "26673/26673 [==============================] - 102s 4ms/step - loss: 6.3431 - val_loss: 6.2005\n", "Epoch 2/40\n", "26673/26673 [==============================] - 101s 4ms/step - loss: 5.8272 - val_loss: 6.1464\n", "Epoch 3/40\n", "26673/26673 [==============================] - 102s 4ms/step - loss: 5.5813 - val_loss: 6.2016\n", "Epoch 4/40\n", "26673/26673 [==============================] - 103s 4ms/step - loss: 5.3414 - val_loss: 6.2657\n", "Epoch 5/40\n", "26673/26673 [==============================] - 103s 4ms/step - loss: 5.0778 - val_loss: 6.3739\n", "Epoch 6/40\n", "26673/26673 [==============================] - 100s 4ms/step - loss: 4.7756 - val_loss: 6.5210\n", "Epoch 7/40\n", "26673/26673 [==============================] - 102s 4ms/step - loss: 4.4323 - val_loss: 6.6638\n", "Epoch 00007: early stopping\n", "elapsed time: 11.904992830753326 minutes\n" ] } ], "source": [ "def build_model(model, address = None):\n", "    \"\"\"\n", "    Fit the model if the model checkpoint does not exist, or else\n", "    load it from that address.\n", "    \"\"\"\n", "    if address is None or not os.path.isfile(address):\n", "        stop = EarlyStopping(monitor = 'val_loss', min_delta = 0,\n", "                             patience = 5, verbose = 1, mode = 'auto')\n", "        save = ModelCheckpoint(address, monitor = 'val_loss',\n", "                               verbose = 0, save_best_only = True)\n", "        callbacks = [stop, save]\n", "\n", "        start = time()\n", "        history = model.fit(X, Y, batch_size = batch_size,\n", "                            epochs = epochs, verbose = 1,\n", "                            validation_split = validation_split,\n", "                            callbacks = callbacks)\n", "        elapse = time() - start\n", "        print('elapsed time: ', elapsed(elapse))\n", "        model_info = {'history': history, 'elapse': elapse, 'model': model}\n", "    else:\n", "        model = load_model(address)\n", "        model_info = {'model': model}\n", "\n", "    return model_info\n", "\n", "\n", "epochs = 40\n", "batch_size = 32\n", "validation_split = 0.2\n", "address1 = 'lstm_weights1.hdf5'\n", "print('model checkpoint address: ', address1)\n", "model_info1 = build_model(model1, address1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In order to test the trained model, we can compare the model's predicted words against the actual word sequences in the dataset." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 51 }, "colab_type": "code", "executionInfo": { "elapsed": 907, "status": "ok", "timestamp": 1525625920545, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "eSq5M9d3Jlwa", "outputId": "b469c185-8522-423d-b7d8-37a0285e94b9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Actual words: they paid to been unseemly certainly never to and \n", "Predicted words: the paid to to the are been to and \n" ] } ], "source": [ "def check_prediction(model, num_predict):\n", "    true_print_out = 'Actual words: '\n", "    pred_print_out = 'Predicted words: '\n", "    for i in range(num_predict):\n", "        x = X[i]\n", "        prediction = model.predict(x[np.newaxis, :], verbose = 0)\n", "        index = np.argmax(prediction)\n", "        true_print_out += index2word[y[i]] + ' '\n", "        pred_print_out += index2word[index] + ' '\n", "\n", "    print(true_print_out)\n", "    print(pred_print_out)\n", "\n", "\n", "num_predict = 10\n", "model = model_info1['model']\n", "check_prediction(model, num_predict)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Despite not being a perfect match, we can see that there is still a rough correspondence between the predicted tokens and the actual ones. Training a network that performs better at language modeling requires a much larger corpus and more training and optimization. But, hopefully, this post has given us a basic understanding of the general process of building a language model." ] }
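, { "cell_type": "markdown", "metadata": {}, "source": [ "The introduction also mentioned using the learned model to generate new text. The following is a minimal sketch of how that could look (the helper `generate_text` is ours): we repeatedly feed the model the last `maxlen` tokens and append its predicted next word. Note that the greedy argmax decoding used here tends to produce repetitive text; sampling from the predicted distribution is a common alternative." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_text(model, seed_indices, num_words):\n", "    # repeatedly predict the next word from the last maxlen tokens\n", "    # and slide the input window forward by one\n", "    generated = list(seed_indices)\n", "    for _ in range(num_words):\n", "        x = np.array(generated[-maxlen:])\n", "        prediction = model.predict(x[np.newaxis, :], verbose = 0)\n", "        generated.append(int(np.argmax(prediction)))\n", "    return ' '.join(index2word[index] for index in generated)\n", "\n", "\n", "# seed with the first training sequence and generate 10 new words\n", "print(generate_text(model, X[0], num_words = 10))" ] }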
, { "cell_type": "markdown", "metadata": {}, "source": [ "The following section lists some ideas worth trying:\n", "\n", "- Sentence-wise model. When generating the sub-sequences for the language model, we could perform sentence detection first by splitting the documents into sentences, then pad each sentence to a fixed length (the length can be determined by the longest sentence).\n", "- Simplify vocabulary. Perform further text preprocessing, such as removing stop words or stemming.\n", "- Hyperparameter tuning. e.g. the size of the embedding layer or LSTM layer, adding dropout, etc. See if a different hyperparameter setting leads to a better model. Note that if we wish to build a stacked LSTM using keras, some changes to the code above are required, as elaborated below.\n", "\n", "When stacking LSTM layers, rather than using the last hidden state as the output to the next layer (e.g. the Dense layer), all the hidden states will be used as the input to the subsequent LSTM layer. In other words, a stacked LSTM will have an output for every time step, as opposed to one output across all time steps. The next couple of code chunks illustrate the difference. Suppose we have two input examples (a batch size of 2), both with a fixed length of 3 time steps." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dimension: (2, 4)\n" ] }, { "data": { "text/plain": [ "array([[ 0.06472953, 0.07302882, -0.0113507 , -0.05956742],\n", "       [ 0.08739619, 0.09793343, -0.01274136, -0.07241688]],\n", "      dtype=float32)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from keras.models import Model\n", "from keras.layers import Input\n", "\n", "# using keras' functional API\n", "seq_len = 3\n", "n_features = 1\n", "hidden_size = 4\n", "data = np.array([[0.1, 0.2, 0.3], [0.15, 0.45, 0.25]]).reshape(-1, seq_len, n_features)\n", "\n", "inputs = Input(shape = (seq_len, n_features))\n", "lstm = LSTM(hidden_size)(inputs)\n", "model = Model(inputs = inputs, outputs = lstm)\n", "prediction = model.predict(data)\n", "print('dimension: ', prediction.shape)\n", "prediction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking at the output of the LSTM layer, we can see that it outputs a single (the last) hidden state for the input sequence. If we were to stack LSTM layers, we would need to access the hidden state output for each time step. 
This can be done by setting the `return_sequences` argument to `True` when defining our LSTM layer, as shown below:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dimension: (2, 3, 4)\n" ] }, { "data": { "text/plain": [ "array([[[-0.01126682, -0.0064276 , 0.00249478, -0.00055401],\n", "        [-0.03018942, -0.01813748, 0.00614582, -0.00209547],\n", "        [-0.05421135, -0.03431558, 0.01020968, -0.0047514 ]],\n", "\n", "       [[-0.01700315, -0.00955516, 0.00374703, -0.00082847],\n", "        [-0.06392946, -0.03527492, 0.01297658, -0.00386212],\n", "        [-0.06742323, -0.04802553, 0.01229313, -0.00726138]]],\n", "      dtype=float32)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs = Input(shape = (seq_len, n_features))\n", "lstm = LSTM(hidden_size, return_sequences = True)(inputs)\n", "model = Model(inputs = inputs, outputs = lstm)\n", "\n", "# three-dimensional output: apart from the batch size and the\n", "# lstm hidden layer's size, there's also an additional dimension\n", "# for the number of time steps\n", "prediction = model.predict(data)\n", "print('dimension: ', prediction.shape)\n", "prediction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When stacking LSTM layers, we should specify `return_sequences = True` so that the next LSTM layer has access to all of the previous layer's hidden states." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, "base_uri": "https://localhost:8080/", "height": 289 }, "colab_type": "code", "executionInfo": { "elapsed": 1355, "status": "ok", "timestamp": 1525625921925, "user": { "displayName": "Ming-Yu Liu", "photoUrl": "https://lh3.googleusercontent.com/a/default-user=s128", "userId": "113235319461992470380" }, "user_tz": 420 }, "id": "MQebgUXeJQIu", "outputId": "bb37433a-dfc3-4584-8beb-26cd12a63286" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_2 (Embedding) (None, 40, 50) 254500 \n", "_________________________________________________________________\n", "lstm_4 (LSTM) (None, 40, 256) 314368 \n", "_________________________________________________________________\n", "lstm_5 (LSTM) (None, 256) 525312 \n", "_________________________________________________________________\n", "dense_2 (Dense) (None, 5090) 1308130 \n", "=================================================================\n", "Total params: 2,402,310\n", "Trainable params: 2,402,310\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n" ] } ], "source": [ "# two-layer stacked LSTM example; this model is not trained\n", "model2 = Sequential()\n", "model2.add(Embedding(num_classes, embedding_size, input_length = maxlen))\n", "model2.add(LSTM(256, return_sequences = True))\n", "model2.add(LSTM(256))\n", "model2.add(Dense(num_classes, activation = 'softmax'))\n", "model2.compile(loss = 'categorical_crossentropy', optimizer = 'adam')\n", "print(model2.summary())" ] }
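, { "cell_type": "markdown", "metadata": {}, "source": [ "One detail worth pointing out in the summary above: the second LSTM layer's parameter count confirms that it consumes the first layer's per-time-step hidden states. Its input dimension is now 256 (the first LSTM's hidden size) rather than the embedding size of 50, giving $4 \\times ((256 + 256) \\times 256 + 256) = 525{,}312$ parameters." ] }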
, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reference" ] }, { "cell_type": "markdown", "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 } }, "colab_type": "code", "id": "gOB_-PK7oQ7j" }, "source": [ "- [Blog: Stacked Long Short-Term Memory Networks](https://machinelearningmastery.com/stacked-long-short-term-memory-networks/)\n", "- [Blog: Text Generation With LSTM Recurrent Neural Networks in Python with Keras](https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/)\n", "- [Blog: How to Develop a Word-Level Neural Language Model and Use it to Generate Text](https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/)\n", "- [Blog: Keras LSTM tutorial – How to easily build a powerful deep learning language model](http://adventuresinmachinelearning.com/keras-lstm-tutorial/)\n", "- [Blog: Understand the Difference Between Return Sequences and Return States for LSTMs in Keras](https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/)\n", "- [Paper: S. Merity, N. Keskar, R. Socher - An Analysis of Neural Language Modeling at Multiple Scales (2018)](https://arxiv.org/abs/1803.08240)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "default_view": {}, "name": "keras_rnn.ipynb", "provenance": [], "version": "0.3.2", "views": {} }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "221px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }