{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from theano.sandbox import cuda\n",
    "cuda.use('gpu1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import utils; reload(utils)\n",
    "from utils import *\n",
    "from __future__ import division, print_function"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We're going to download the collected works of Nietzsche to use as our data for this class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "corpus length: 600901\n"
     ]
    }
   ],
   "source": [
    "path = get_file('nietzsche.txt', origin=\"https://s3.amazonaws.com/text-datasets/nietzsche.txt\")\n",
    "text = open(path).read()\n",
    "print('corpus length:', len(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total chars: 86\n"
     ]
    }
   ],
   "source": [
    "chars = sorted(list(set(text)))\n",
    "vocab_size = len(chars)+1\n",
    "print('total chars:', vocab_size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Sometimes it's useful to have a zero value in the dataset, e.g. for padding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "chars.insert(0, \"\\0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n !\"\\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "''.join(chars[1:-6])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Map from chars to indices and back again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "char_indices = dict((c, i) for i, c in enumerate(chars))\n",
    "indices_char = dict((i, c) for i, c in enumerate(chars))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "*idx* will be the data we use from now own - it simply converts all the characters to their index (based on the mapping above)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "idx = [char_indices[c] for c in text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'PREFACE\\n\\n\\nSUPPOSING that Truth is a woman--what then? Is there not gro'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "''.join(indices_char[i] for i in idx[:70])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## 3 char model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Create inputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "cs=3\n",
    "c1_dat = [idx[i] for i in xrange(0, len(idx)-1-cs, cs)]\n",
    "c2_dat = [idx[i+1] for i in xrange(0, len(idx)-1-cs, cs)]\n",
    "c3_dat = [idx[i+2] for i in xrange(0, len(idx)-1-cs, cs)]\n",
    "c4_dat = [idx[i+3] for i in xrange(0, len(idx)-1-cs, cs)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "x1 = np.stack(c1_dat[:-2])\n",
    "x2 = np.stack(c2_dat[:-2])\n",
    "x3 = np.stack(c3_dat[:-2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "y = np.stack(c4_dat[:-2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The first 4 inputs and outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x1[:4], x2[:4], x3[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([30, 29,  1, 40])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((200297,), (200297,))"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x1.shape, y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The number of latent factors to create (i.e. the size of the embedding matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_fac = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Create inputs and embedding outputs for each of our 3 character inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def embedding_input(name, n_in, n_out):\n",
    "    inp = Input(shape=(1,), dtype='int64', name=name)\n",
    "    emb = Embedding(n_in, n_out, input_length=1)(inp)\n",
    "    return inp, Flatten()(emb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c1_in, c1 = embedding_input('c1', vocab_size, n_fac)\n",
    "c2_in, c2 = embedding_input('c2', vocab_size, n_fac)\n",
    "c3_in, c3 = embedding_input('c3', vocab_size, n_fac)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Create and train model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Pick a size for our hidden state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_hidden = 256"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is the 'green arrow' from our diagram - the layer operation from input to hidden."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dense_in = Dense(n_hidden, activation='relu')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our first hidden activation is simply this function applied to the result of the embedding of the first character."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c1_hidden = dense_in(c1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is the 'orange arrow' from our diagram - the layer operation from hidden to hidden."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dense_hidden = Dense(n_hidden, activation='tanh')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our second and third hidden activations sum up the previous hidden state (after applying dense_hidden) to the new input state."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c2_dense = dense_in(c2)\n",
    "hidden_2 = dense_hidden(c1_hidden)\n",
    "c2_hidden = merge([c2_dense, hidden_2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c3_dense = dense_in(c3)\n",
    "hidden_3 = dense_hidden(c2_hidden)\n",
    "c3_hidden = merge([c3_dense, hidden_3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is the 'blue arrow' from our diagram - the layer operation from hidden to output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dense_out = Dense(vocab_size, activation='softmax')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The third hidden state is the input to our output layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c4_out = dense_out(c3_hidden)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Model([c1_in, c2_in, c3_in], c4_out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr=0.000001"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "200297/200297 [==============================] - 4s - loss: 4.2459     \n",
      "Epoch 2/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.6853     \n",
      "Epoch 3/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.3444     \n",
      "Epoch 4/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.1719     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7ff3778ec510>"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr=0.01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.0829     \n",
      "Epoch 2/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.0314     \n",
      "Epoch 3/4\n",
      "200297/200297 [==============================] - 4s - loss: 2.9973     \n",
      "Epoch 4/4\n",
      "200297/200297 [==============================] - 5s - loss: 2.9722     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7ff3778ecdd0>"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr.set_value(0.000001)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "200297/200297 [==============================] - 5s - loss: 4.4125     \n",
      "Epoch 2/4\n",
      "200297/200297 [==============================] - 5s - loss: 4.2799     \n",
      "Epoch 3/4\n",
      "200297/200297 [==============================] - 5s - loss: 4.0000     \n",
      "Epoch 4/4\n",
      "200297/200297 [==============================] - 5s - loss: 3.5942     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7ff3788d4d10>"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr.set_value(0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "200297/200297 [==============================] - 5s - loss: 7.8651     \n",
      "Epoch 2/4\n",
      "200297/200297 [==============================] - 5s - loss: 5.1607     \n",
      "Epoch 3/4\n",
      "200297/200297 [==============================] - 5s - loss: 4.7043     \n",
      "Epoch 4/4\n",
      "200297/200297 [==============================] - 5s - loss: 4.7026     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7ff37ba08a50>"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Test model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_next(inp):\n",
    "    idxs = [char_indices[c] for c in inp]\n",
    "    arrs = [np.array(i)[np.newaxis] for i in idxs]\n",
    "    p = model.predict(arrs)\n",
    "    i = np.argmax(p)\n",
    "    return chars[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'l'"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next('phi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'e'"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next(' th')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'d'"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next(' an')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Our first RNN!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Create inputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is the size of our unrolled RNN."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "cs=8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to out model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]\n",
    "            for n in range(cs)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Then create a list of the next character in each of these series. This will be the labels for our model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs, cs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "xs = [np.stack(c[:-2]) for c in c_in_dat]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8, (75110,))"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(xs), xs[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "y = np.stack(c_out_dat[:-2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "So each column below is one series of 8 characters from the text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([40,  1, 33,  2, 72, 67, 73,  2]),\n",
       " array([42,  1, 38, 44,  2,  9, 61, 73]),\n",
       " array([29, 43, 31, 71, 54,  9, 58, 61]),\n",
       " array([30, 45,  2, 74,  2, 76, 67, 58]),\n",
       " array([25, 40, 73, 73, 76, 61, 24, 71]),\n",
       " array([27, 40, 61, 61, 68, 54,  2, 58]),\n",
       " array([29, 39, 54,  2, 66, 73, 33,  2]),\n",
       " array([ 1, 43, 73, 62, 54,  2, 72, 67])]"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[xs[n][:cs] for n in range(cs)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "...and this is the next character after each sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1, 33,  2, 72, 67, 73,  2, 68])"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[:cs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_fac = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Create and train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def embedding_input(name, n_in, n_out):\n",
    "    inp = Input(shape=(1,), dtype='int64', name=name+'_in')\n",
    "    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(inp)\n",
    "    return inp, Flatten()(emb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_hidden = 256"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dense_in = Dense(n_hidden, activation='relu')\n",
    "dense_hidden = Dense(n_hidden, activation='relu', init='identity')\n",
    "dense_out = Dense(vocab_size, activation='softmax')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The first character of each sequence goes through dense_in(), to create our first hidden activations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "hidden = dense_in(c_ins[0][1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Then for each successive layer we combine the output of dense_in() on the next character with the output of dense_hidden() on the current hidden state, to create the new hidden state."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "for i in range(1,cs):\n",
    "    c_dense = dense_in(c_ins[i][1])\n",
    "    hidden = dense_hidden(hidden)\n",
    "    hidden = merge([c_dense, hidden])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Putting the final hidden state through dense_out() gives us our output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "c_out = dense_out(hidden)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "So now we can create our model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Model([c[0] for c in c_ins], c_out)\n",
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/12\n",
      "75110/75110 [==============================] - 3s - loss: 2.5385     \n",
      "Epoch 2/12\n",
      "75110/75110 [==============================] - 3s - loss: 2.2645     \n",
      "Epoch 3/12\n",
      "75110/75110 [==============================] - 3s - loss: 2.1596     \n",
      "Epoch 4/12\n",
      "75110/75110 [==============================] - 3s - loss: 2.0888     \n",
      "Epoch 5/12\n",
      "75110/75110 [==============================] - 3s - loss: 2.0355     \n",
      "Epoch 6/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.9897     \n",
      "Epoch 7/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.9506     \n",
      "Epoch 8/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.9149     \n",
      "Epoch 9/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.8840     \n",
      "Epoch 10/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.8546     \n",
      "Epoch 11/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.8293     \n",
      "Epoch 12/12\n",
      "75110/75110 [==============================] - 3s - loss: 1.8050     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f25579a80d0>"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(xs, y, batch_size=64, nb_epoch=12)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Test model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_next(inp):\n",
    "    idxs = [np.array(char_indices[c])[np.newaxis] for c in inp]\n",
    "    p = model.predict(idxs)\n",
    "    return chars[np.argmax(p)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'e'"
      ]
     },
     "execution_count": 182,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next('for thos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 432,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t'"
      ]
     },
     "execution_count": 432,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next('part of ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 433,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'n'"
      ]
     },
     "execution_count": 433,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next('queens a')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "heading_collapsed": true
   },
   "source": [
    "## Our first RNN with keras!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_hidden, n_fac, cs, vocab_size = (256, 42, 8, 86)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is nearly exactly equivalent to the RNN we built ourselves in the previous section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model=Sequential([\n",
    "        Embedding(vocab_size, n_fac, input_length=cs),\n",
    "        SimpleRNN(n_hidden, activation='relu', inner_init='identity'),\n",
    "        Dense(vocab_size, activation='softmax')\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "____________________________________________________________________________________________________\n",
      "Layer (type)                     Output Shape          Param #     Connected to                     \n",
      "====================================================================================================\n",
      "embedding_5 (Embedding)          (None, 8, 42)         3612        embedding_input_2[0][0]          \n",
      "____________________________________________________________________________________________________\n",
      "simplernn_2 (SimpleRNN)          (None, 256)           76544       embedding_5[0][0]                \n",
      "____________________________________________________________________________________________________\n",
      "dense_2 (Dense)                  (None, 86)            22102       simplernn_2[0][0]                \n",
      "====================================================================================================\n",
      "Total params: 102258\n",
      "____________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/8\n",
      "75110/75110 [==============================] - 3s - loss: 2.7939     \n",
      "Epoch 2/8\n",
      "75110/75110 [==============================] - 3s - loss: 2.2970     \n",
      "Epoch 3/8\n",
      "75110/75110 [==============================] - 3s - loss: 2.0814     \n",
      "Epoch 4/8\n",
      "75110/75110 [==============================] - 3s - loss: 1.9416     \n",
      "Epoch 5/8\n",
      "75110/75110 [==============================] - 3s - loss: 1.8406     \n",
      "Epoch 6/8\n",
      "75110/75110 [==============================] - 3s - loss: 1.7625     \n",
      "Epoch 7/8\n",
      "75110/75110 [==============================] - 3s - loss: 1.6960     \n",
      "Epoch 8/8\n",
      "75110/75110 [==============================] - 3s - loss: 1.6421     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa18f2c0890>"
      ]
     },
     "execution_count": 217,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(np.concatenate(xs,axis=1), y, batch_size=64, nb_epoch=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_next_keras(inp):\n",
    "    idxs = [char_indices[c] for c in inp]\n",
    "    arrs = np.array(idxs)[np.newaxis,:]\n",
    "    p = model.predict(arrs)[0]\n",
    "    return chars[np.argmax(p)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t'"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next_keras('this is ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t'"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next_keras('part of ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'n'"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_next_keras('queens a')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Returning sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Create inputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)\n",
    "\n",
    "Here, c_out_dat is identical to c_in_dat, but moved across 1 character."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "#c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]\n",
    "#            for n in range(cs)]\n",
    "c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]\n",
    "            for n in range(cs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "ys = [np.stack(c[:-2]) for c in c_out_dat]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Reading down each column shows one set of inputs and outputs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([40,  1, 33,  2, 72, 67, 73,  2]),\n",
       " array([42,  1, 38, 44,  2,  9, 61, 73]),\n",
       " array([29, 43, 31, 71, 54,  9, 58, 61]),\n",
       " array([30, 45,  2, 74,  2, 76, 67, 58]),\n",
       " array([25, 40, 73, 73, 76, 61, 24, 71]),\n",
       " array([27, 40, 61, 61, 68, 54,  2, 58]),\n",
       " array([29, 39, 54,  2, 66, 73, 33,  2]),\n",
       " array([ 1, 43, 73, 62, 54,  2, 72, 67])]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[xs[n][:cs] for n in range(cs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([42,  1, 38, 44,  2,  9, 61, 73]),\n",
       " array([29, 43, 31, 71, 54,  9, 58, 61]),\n",
       " array([30, 45,  2, 74,  2, 76, 67, 58]),\n",
       " array([25, 40, 73, 73, 76, 61, 24, 71]),\n",
       " array([27, 40, 61, 61, 68, 54,  2, 58]),\n",
       " array([29, 39, 54,  2, 66, 73, 33,  2]),\n",
       " array([ 1, 43, 73, 62, 54,  2, 72, 67]),\n",
       " array([ 1, 33,  2, 72, 67, 73,  2, 68])]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[ys[n][:cs] for n in range(cs)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Create and train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dense_in = Dense(n_hidden, activation='relu')\n",
    "dense_hidden = Dense(n_hidden, activation='relu', init='identity')\n",
    "dense_out = Dense(vocab_size, activation='softmax', name='output')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We're going to pass a vector of all zeros as our starting point - here's our input layers for that:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "inp1 = Input(shape=(n_fac,), name='zeros')\n",
    "hidden = dense_in(inp1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "outs = []\n",
    "\n",
    "for i in range(cs):\n",
    "    c_dense = dense_in(c_ins[i][1])\n",
    "    hidden = dense_hidden(hidden)\n",
    "    hidden = merge([c_dense, hidden], mode='sum')\n",
    "    # every layer now has an output\n",
    "    outs.append(dense_out(hidden))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Model([inp1] + [c[0] for c in c_ins], outs)\n",
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75110, 42)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))\n",
    "zeros.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 394,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO (theano.gof.compilelock): Refreshing lock /home/jhoward/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.12-64/lock_dir/lock\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/12\n",
      "75110/75110 [==============================] - 7s - loss: 20.0841 - output_loss_1: 2.7123 - output_loss_2: 2.5681 - output_loss_3: 2.5143 - output_loss_4: 2.4739 - output_loss_5: 2.4675 - output_loss_6: 2.4442 - output_loss_7: 2.4627 - output_loss_8: 2.4410     \n",
      "Epoch 2/12\n",
      "75110/75110 [==============================] - 7s - loss: 17.8335 - output_loss_1: 2.5124 - output_loss_2: 2.3529 - output_loss_3: 2.2368 - output_loss_4: 2.1686 - output_loss_5: 2.1540 - output_loss_6: 2.1337 - output_loss_7: 2.1520 - output_loss_8: 2.1232     \n",
      "Epoch 3/12\n",
      "75110/75110 [==============================] - 7s - loss: 17.2340 - output_loss_1: 2.4967 - output_loss_2: 2.3306 - output_loss_3: 2.1766 - output_loss_4: 2.0814 - output_loss_5: 2.0529 - output_loss_6: 2.0291 - output_loss_7: 2.0475 - output_loss_8: 2.0192     \n",
      "Epoch 4/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.8647 - output_loss_1: 2.4896 - output_loss_2: 2.3218 - output_loss_3: 2.1437 - output_loss_4: 2.0278 - output_loss_5: 1.9901 - output_loss_6: 1.9600 - output_loss_7: 1.9768 - output_loss_8: 1.9549     \n",
      "Epoch 5/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.6200 - output_loss_1: 2.4858 - output_loss_2: 2.3158 - output_loss_3: 2.1287 - output_loss_4: 1.9941 - output_loss_5: 1.9481 - output_loss_6: 1.9151 - output_loss_7: 1.9276 - output_loss_8: 1.9047     \n",
      "Epoch 6/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.4396 - output_loss_1: 2.4835 - output_loss_2: 2.3121 - output_loss_3: 2.1148 - output_loss_4: 1.9705 - output_loss_5: 1.9188 - output_loss_6: 1.8774 - output_loss_7: 1.8937 - output_loss_8: 1.8689     \n",
      "Epoch 7/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.3016 - output_loss_1: 2.4825 - output_loss_2: 2.3090 - output_loss_3: 2.1054 - output_loss_4: 1.9523 - output_loss_5: 1.8957 - output_loss_6: 1.8514 - output_loss_7: 1.8639 - output_loss_8: 1.8414     \n",
      "Epoch 8/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.1862 - output_loss_1: 2.4807 - output_loss_2: 2.3076 - output_loss_3: 2.0974 - output_loss_4: 1.9391 - output_loss_5: 1.8757 - output_loss_6: 1.8284 - output_loss_7: 1.8413 - output_loss_8: 1.8161     \n",
      "Epoch 9/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.0887 - output_loss_1: 2.4802 - output_loss_2: 2.3055 - output_loss_3: 2.0913 - output_loss_4: 1.9275 - output_loss_5: 1.8603 - output_loss_6: 1.8101 - output_loss_7: 1.8200 - output_loss_8: 1.7938     \n",
      "Epoch 10/12\n",
      "75110/75110 [==============================] - 7s - loss: 16.0118 - output_loss_1: 2.4790 - output_loss_2: 2.3038 - output_loss_3: 2.0882 - output_loss_4: 1.9172 - output_loss_5: 1.8458 - output_loss_6: 1.7946 - output_loss_7: 1.8049 - output_loss_8: 1.7782     \n",
      "Epoch 11/12\n",
      "75110/75110 [==============================] - 7s - loss: 15.9393 - output_loss_1: 2.4784 - output_loss_2: 2.3027 - output_loss_3: 2.0827 - output_loss_4: 1.9095 - output_loss_5: 1.8341 - output_loss_6: 1.7803 - output_loss_7: 1.7885 - output_loss_8: 1.7631     \n",
      "Epoch 12/12\n",
      "75110/75110 [==============================] - 7s - loss: 15.8785 - output_loss_1: 2.4773 - output_loss_2: 2.3021 - output_loss_3: 2.0788 - output_loss_4: 1.9015 - output_loss_5: 1.8239 - output_loss_6: 1.7680 - output_loss_7: 1.7770 - output_loss_8: 1.7498     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa168d005d0>"
      ]
     },
     "execution_count": 394,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Test model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 395,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_nexts(inp):\n",
    "    idxs = [char_indices[c] for c in inp]\n",
    "    arrs = [np.array(i)[np.newaxis] for i in idxs]\n",
    "    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)\n",
    "    print(list(inp))\n",
    "    return [chars[np.argmax(o)] for o in p]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 396,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', 't', 'h', 'i', 's', ' ', 'i', 's']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['t', 'h', 'e', 't', ' ', 'c', 's', ' ']"
      ]
     },
     "execution_count": 396,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nexts(' this is')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 397,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['t', 'o', 'r', 't', ' ', 'o', 'f', ' ']"
      ]
     },
     "execution_count": 397,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nexts(' part of')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "heading_collapsed": true
   },
   "source": [
    "### Sequence model with keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(256, 42, 8, 86)"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_hidden, n_fac, cs, vocab_size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model=Sequential([\n",
    "        Embedding(vocab_size, n_fac, input_length=cs),\n",
    "        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),\n",
    "        TimeDistributed(Dense(vocab_size, activation='softmax')),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "____________________________________________________________________________________________________\n",
      "Layer (type)                     Output Shape          Param #     Connected to                     \n",
      "====================================================================================================\n",
      "embedding_6 (Embedding)          (None, 8, 42)         3612        embedding_input_3[0][0]          \n",
      "____________________________________________________________________________________________________\n",
      "simplernn_3 (SimpleRNN)          (None, 8, 256)        76544       embedding_6[0][0]                \n",
      "____________________________________________________________________________________________________\n",
      "timedistributed_1 (TimeDistribut (None, 8, 86)         22102       simplernn_3[0][0]                \n",
      "====================================================================================================\n",
      "Total params: 102258\n",
      "____________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(75110,)"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xs[0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "x_rnn=np.stack(np.squeeze(xs), axis=1)\n",
    "y_rnn=np.atleast_3d(np.stack(ys, axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((75110, 8), (75110, 8, 1))"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_rnn.shape, y_rnn.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/8\n",
      "75110/75110 [==============================] - 4s - loss: 2.4284     \n",
      "Epoch 2/8\n",
      "75110/75110 [==============================] - 4s - loss: 2.0006     \n",
      "Epoch 3/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.8863     \n",
      "Epoch 4/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.8264     \n",
      "Epoch 5/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7882     \n",
      "Epoch 6/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7613     \n",
      "Epoch 7/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7417     \n",
      "Epoch 8/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7258     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f82761cc990>"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_nexts_keras(inp):\n",
    "    idxs = [char_indices[c] for c in inp]\n",
    "    arr = np.array(idxs)[np.newaxis,:]\n",
    "    p = model.predict(arr)[0]\n",
    "    print(list(inp))\n",
    "    return [chars[np.argmax(o)] for o in p]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', 't', 'h', 'i', 's', ' ', 'i', 's']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['t', 'h', 'e', 's', ' ', 'i', 's', ' ']"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nexts_keras(' this is')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "heading_collapsed": true
   },
   "source": [
    "### One-hot sequence model with keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is the keras version of the theano model that we're about to create."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model=Sequential([\n",
    "        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),\n",
    "                  activation='relu', inner_init='identity'),\n",
    "        TimeDistributed(Dense(vocab_size, activation='softmax')),\n",
    "    ])\n",
    "model.compile(loss='categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((75110, 8, 86), (75110, 8, 86))"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oh_ys = [to_categorical(o, vocab_size) for o in ys]\n",
    "oh_y_rnn=np.stack(oh_ys, axis=1)\n",
    "\n",
    "oh_xs = [to_categorical(o, vocab_size) for o in xs]\n",
    "oh_x_rnn=np.stack(oh_xs, axis=1)\n",
    "\n",
    "oh_x_rnn.shape, oh_y_rnn.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/8\n",
      "75110/75110 [==============================] - 4s - loss: 2.4383     \n",
      "Epoch 2/8\n",
      "75110/75110 [==============================] - 4s - loss: 2.0318     \n",
      "Epoch 3/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.9195     \n",
      "Epoch 4/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.8553     \n",
      "Epoch 5/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.8133     \n",
      "Epoch 6/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7829     \n",
      "Epoch 7/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7593     \n",
      "Epoch 8/8\n",
      "75110/75110 [==============================] - 4s - loss: 1.7410     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f8210725c90>"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_nexts_oh(inp):\n",
    "    idxs = np.array([char_indices[c] for c in inp])\n",
    "    arr = to_categorical(idxs, vocab_size)\n",
    "    p = model.predict(arr[np.newaxis,:])[0]\n",
    "    print(list(inp))\n",
    "    return [chars[np.argmax(o)] for o in p]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', 't', 'h', 'i', 's', ' ', 'i', 's']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['t', 'h', 'e', 's', ' ', 'i', 's', ' ']"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nexts_oh(' this is')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "heading_collapsed": true
   },
   "source": [
    "## Stateful model with keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "bs=64"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "A stateful model is easy to create (just add \"stateful=True\") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.\n",
    "\n",
    "When using stateful in keras, you have to also add 'batch_input_shape' to the first layer, and fix the batch size there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model=Sequential([\n",
    "        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),\n",
    "        BatchNormalization(),\n",
    "        LSTM(n_hidden, return_sequences=True, stateful=True),\n",
    "        TimeDistributed(Dense(vocab_size, activation='softmax')),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Since we're using a fixed batch shape, we have to ensure our inputs and outputs are a even multiple of the batch size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "mx = len(x_rnn)//bs*bs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO (theano.gof.compilelock): Refreshing lock /home/jhoward/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.12-64/lock_dir/lock\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "75072/75072 [==============================] - 13s - loss: 2.2051    \n",
      "Epoch 2/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.9621    \n",
      "Epoch 3/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.8893    \n",
      "Epoch 4/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.8453    \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa16f1d2690>"
      ]
     },
     "execution_count": 341,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 342,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr=1e-4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 343,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.8132    \n",
      "Epoch 2/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7877    \n",
      "Epoch 3/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7663    \n",
      "Epoch 4/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7475    \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa1773b8c10>"
      ]
     },
     "execution_count": 343,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 344,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7308    \n",
      "Epoch 2/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7155    \n",
      "Epoch 3/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.7014    \n",
      "Epoch 4/4\n",
      "75072/75072 [==============================] - 13s - loss: 1.6881    \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fa1773b8d50>"
      ]
     },
     "execution_count": 344,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Theano RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "n_input = vocab_size\n",
    "n_output = vocab_size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Using raw theano, we have to create our weight matrices and bias vectors ourselves - here are the functions we'll use to do so (using glorot initialization).\n",
    "\n",
    "The return values are wrapped in `shared()`, which is how we tell theano that it can manage this data (copying it to and from the GPU as necessary)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def init_wgts(rows, cols): \n",
    "    scale = math.sqrt(2/rows)\n",
    "    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))\n",
    "def init_bias(rows): \n",
    "    return shared(np.zeros(rows, dtype=np.float32))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We return the weights and biases together as a tuple. For the hidden weights, we'll use an identity initialization (as recommended by [Hinton](https://arxiv.org/abs/1504.00941).)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def wgts_and_bias(n_in, n_out): \n",
    "    return init_wgts(n_in, n_out), init_bias(n_out)\n",
    "def id_and_bias(n): \n",
    "    return shared(np.eye(n, dtype=np.float32)), init_bias(n)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Theano doesn't actually do any computations until we explicitly compile and evaluate the function (at which point it'll be turned into CUDA code and sent off to the GPU). So our job is to describe the computations that we'll want theano to do - the first step is to tell theano what inputs we'll be providing to our computation:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "t_inp = T.matrix('inp')\n",
    "t_outp = T.matrix('outp')\n",
    "t_h0 = T.vector('h0')\n",
    "lr = T.scalar('lr')\n",
    "\n",
    "all_args = [t_h0, t_inp, t_outp, lr]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Now we're ready to create our intial weight matrices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "W_h = id_and_bias(n_hidden)\n",
    "W_x = wgts_and_bias(n_input, n_hidden)\n",
    "W_y = wgts_and_bias(n_hidden, n_output)\n",
    "w_all = list(chain.from_iterable([W_h, W_x, W_y]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Theano handles looping by using the [GPU scan](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html) operation. We have to tell theano what to do at each step through the scan - this is the function we'll use, which does a single forward pass for one character:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):\n",
    "    # Calculate the hidden activations\n",
    "    h = nnet.relu(T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h)\n",
    "    # Calculate the output activations\n",
    "    y = nnet.softmax(T.dot(h, W_y) + b_y)\n",
    "    # Return both (the 'Flatten()' is to work around a theano bug)\n",
    "    return h, T.flatten(y, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Now we can provide everything necessary for the scan operation, so we can setup that up - we have to pass in the function to call at each step, the sequence to step through, the initial values of the outputs, and any other arguments to pass to the step function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "[v_h, v_y], _ = theano.scan(step, sequences=t_inp, \n",
    "                            outputs_info=[t_h0, None], non_sequences=w_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We can now calculate our loss function, and *all* of our gradients, with just a couple of lines of code!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "error = nnet.categorical_crossentropy(v_y, t_outp).sum()\n",
    "g_all = T.grad(error, w_all)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We even have to show theano how to do SGD - so we set up this dictionary of updates to complete after every forward pass, which apply to standard SGD update rule to every weight."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def upd_dict(wgts, grads, lr): \n",
    "    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})\n",
    "\n",
    "upd = upd_dict(w_all, g_all, lr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We're finally ready to compile the function!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((75110, 8, 86), (75110, 8, 86))"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = oh_x_rnn\n",
    "Y = oh_y_rnn\n",
    "X.shape, Y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error:25.196\n",
      "Error:21.489\n",
      "Error:20.900\n",
      "Error:19.913\n",
      "Error:18.816\n",
      "Error:19.202\n",
      "Error:19.066\n",
      "Error:18.473\n",
      "Error:17.942\n",
      "Error:18.251\n",
      "Error:17.489\n",
      "Error:17.570\n",
      "Error:18.371\n",
      "Error:17.331\n",
      "Error:16.807\n",
      "Error:17.681\n",
      "Error:17.401\n",
      "Error:17.136\n",
      "Error:16.830\n",
      "Error:16.651\n",
      "Error:16.518\n",
      "Error:16.430\n",
      "Error:16.687\n",
      "Error:16.161\n",
      "Error:16.775\n",
      "Error:16.566\n",
      "Error:16.053\n",
      "Error:16.296\n",
      "Error:16.240\n",
      "Error:16.454\n",
      "Error:16.699\n",
      "Error:16.396\n",
      "Error:16.644\n",
      "Error:16.328\n",
      "Error:15.990\n",
      "Error:16.644\n",
      "Error:15.981\n",
      "Error:16.359\n",
      "Error:16.042\n",
      "Error:16.326\n",
      "Error:15.361\n",
      "Error:15.690\n",
      "Error:15.742\n",
      "Error:16.048\n",
      "Error:15.955\n",
      "Error:15.866\n",
      "Error:15.571\n",
      "Error:16.069\n",
      "Error:15.997\n",
      "Error:16.030\n",
      "Error:15.230\n",
      "Error:15.612\n",
      "Error:14.918\n",
      "Error:14.821\n",
      "Error:15.580\n",
      "Error:15.380\n",
      "Error:14.650\n",
      "Error:15.499\n",
      "Error:15.110\n",
      "Error:14.972\n",
      "Error:15.034\n",
      "Error:15.427\n",
      "Error:15.236\n",
      "Error:15.037\n",
      "Error:14.768\n",
      "Error:14.781\n",
      "Error:14.329\n",
      "Error:14.726\n",
      "Error:15.229\n",
      "Error:14.809\n",
      "Error:15.144\n",
      "Error:14.755\n",
      "Error:14.440\n",
      "Error:14.431\n",
      "Error:14.464\n"
     ]
    }
   ],
   "source": [
    "err=0.0; l_rate=0.01\n",
    "for i in range(len(X)): \n",
    "    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)\n",
    "    if i % 1000 == 999: \n",
    "        print (\"Error:{:.3f}\".format(err/1000))\n",
    "        err=0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "act = np.argmax(X[6], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['t', 'h', 'e', 'n', '?', ' ', 'I', 's']"
      ]
     },
     "execution_count": 338,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[indices_char[o] for o in act]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['h', 'e', ' ', ' ', ' ', 'T', 'n', ' ']"
      ]
     },
     "execution_count": 339,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[indices_char[o] for o in pred]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Pure python RNN!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Set up basic functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Now we're going to try to repeat the above theano RNN, using just pure python (and numpy). Which means, we have to do everything ourselves, including defining the basic functions of a neural net! Below are all of the definitions, along with tests to check that they give the same answers as theano. The functions ending in `_d` are the derivatives of each function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def sigmoid(x): return 1/(1+np.exp(-x))\n",
    "def sigmoid_d(x): \n",
    "    output = sigmoid(x)\n",
    "    return output*(1-output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def relu(x): return np.maximum(0., x)\n",
    "def relu_d(x): return (x > 0.)*1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 3.,  0.]), array([ 1.,  0.]))"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def dist(a,b): return pow(a-b,2)\n",
    "def dist_d(a,b): return 2*(a-b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import pdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "eps = 1e-7\n",
    "def x_entropy(pred, actual): \n",
    "    return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))\n",
    "def x_entropy_d(pred, actual): return -actual/pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def softmax(x): return np.exp(x)/np.exp(x).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def softmax_d(x):\n",
    "    sm = softmax(x)\n",
    "    res = np.expand_dims(-sm,-1)*sm\n",
    "    res[np.diag_indices_from(res)] = sm*(1-sm)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(0.35667494393873245)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_preds = np.array([0.2,0.7,0.1])\n",
    "test_actuals = np.array([0.,1.,0.])\n",
    "nnet.categorical_crossentropy(test_preds, test_actuals).eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.35667494393873245"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_entropy(test_preds, test_actuals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "test_inp = T.dvector()\n",
    "test_out = nnet.categorical_crossentropy(test_inp, test_actuals)\n",
    "test_grad = theano.function([test_inp], T.grad(test_out, test_inp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.    , -1.4286, -0.    ])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_grad(test_preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.    , -1.4286, -0.    ])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_entropy_d(test_preds, test_actuals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "pre_pred = random(oh_x_rnn[0][0].shape)\n",
    "preds = softmax(pre_pred)\n",
    "actual = oh_x_rnn[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.allclose(softmax_d(pre_pred).dot(x_entropy_d(preds,actual)), preds-actual)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.2814,  0.464 ,  0.2546])"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "softmax(test_preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.2814,  0.464 ,  0.2546]])"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nnet.softmax(test_preds).eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "test_out = T.flatten(nnet.softmax(test_inp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.2022, -0.1306, -0.0717],\n",
       "       [-0.1306,  0.2487, -0.1181],\n",
       "       [-0.0717, -0.1181,  0.1898]])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_grad(test_preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.2022, -0.1306, -0.0717],\n",
       "       [-0.1306,  0.2487, -0.1181],\n",
       "       [-0.0717, -0.1181,  0.1898]])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "softmax_d(test_preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "act=relu\n",
    "act_d = relu_d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "loss=x_entropy\n",
    "loss_d=x_entropy_d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We also have to define our own scan function. Since we're not worrying about running things in parallel, it's very simple to implement:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def scan(fn, start, seq):\n",
    "    res = []\n",
    "    prev = start\n",
    "    for s in seq:\n",
    "        app = fn(prev, s)\n",
    "        res.append(app)\n",
    "        prev = app\n",
    "    return res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "...for instance, `scan` on `+` is the cumulative sum."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 3, 6, 10]"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scan(lambda prev,curr: prev+curr, 0, range(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Set up training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Let's now build the functions to do the forward and backward passes of our RNN. First, define our data and shape."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "inp = oh_x_rnn\n",
    "outp = oh_y_rnn\n",
    "n_input = vocab_size\n",
    "n_output = vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((75110, 8, 86), (75110, 8, 86))"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inp.shape, outp.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Here's the function to do a single forward pass of an RNN, for a single character."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def one_char(prev, item):\n",
    "    # Previous state\n",
    "    tot_loss, pre_hidden, pre_pred, hidden, ypred = prev\n",
    "    # Current inputs and output\n",
    "    x, y = item\n",
    "    pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)\n",
    "    hidden = act(pre_hidden)\n",
    "    pre_pred = np.dot(hidden,w_y)\n",
    "    ypred = softmax(pre_pred)\n",
    "    return (\n",
    "        # Keep track of loss so we can report it\n",
    "        tot_loss+loss(ypred, y),\n",
    "        # Used in backprop\n",
    "        pre_hidden, pre_pred, \n",
    "        # Used in next iteration\n",
    "        hidden, \n",
    "        # To provide predictions\n",
    "        ypred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We use `scan` to apply the above to a whole sequence of characters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def get_chars(n): return zip(inp[n], outp[n])\n",
    "def one_fwd(n): return scan(one_char, (0,0,0,np.zeros(n_hidden),0), get_chars(n))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Now we can define the backward step. We use a loop to go through every element of the sequence. The derivatives are applying the chain rule to each step, and accumulating the gradients across the sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# \"Columnify\" a vector\n",
    "def col(x): return x[:,newaxis]\n",
    "\n",
    "def one_bkwd(args, n):\n",
    "    global w_x,w_y,w_h\n",
    "\n",
    "    i=inp[n]  # 8x86\n",
    "    o=outp[n] # 8x86\n",
    "    d_pre_hidden = np.zeros(n_hidden) # 256\n",
    "    for p in reversed(range(len(i))):\n",
    "        totloss, pre_hidden, pre_pred, hidden, ypred = args[p]\n",
    "        x=i[p] # 86\n",
    "        y=o[p] # 86\n",
    "        d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y))  # 86\n",
    "        d_pre_hidden = (np.dot(d_pre_hidden, w_h.T) \n",
    "                        + np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256\n",
    "\n",
    "        # d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)\n",
    "        w_y -= col(hidden) * d_pre_pred * alpha\n",
    "        # d(loss)/d(w_h) = d(loss)/d(pre_hidden[p-1]) * d(pre_hidden[p-1])/d(w_h)\n",
    "        if (p>0): w_h -= args[p-1][3].dot(d_pre_hidden) * alpha\n",
    "        w_x -= col(x)*d_pre_hidden * alpha\n",
    "    return d_pre_hidden"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Now we can set up our initial weight matrices. Note that we're not using bias at all in this example, in order to keep things simpler."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "scale=math.sqrt(2./n_input)\n",
    "w_x = normal(scale=scale, size=(n_input,n_hidden))\n",
    "w_y = normal(scale=scale, size=(n_hidden, n_output))\n",
    "w_h = np.eye(n_hidden, dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our loop looks much like the theano loop in the previous section, except that we have to call the backwards step ourselves."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error:35.2380; Gradient:2.90002\n",
      "Error:32.9176; Gradient:2.71170\n",
      "Error:31.0649; Gradient:4.14135\n",
      "Error:29.9798; Gradient:3.40467\n",
      "Error:29.2453; Gradient:3.79049\n",
      "Error:29.0070; Gradient:3.39826\n",
      "Error:28.2358; Gradient:4.30422\n",
      "Error:28.0086; Gradient:2.92011\n",
      "Error:27.6885; Gradient:4.03503\n",
      "Error:27.6905; Gradient:3.18526\n"
     ]
    }
   ],
   "source": [
    "overallError=0\n",
    "alpha=0.0001\n",
    "for n in range(10000):\n",
    "    res = one_fwd(n)\n",
    "    overallError+=res[-1][0]\n",
    "    deriv = one_bkwd(res, n)\n",
    "    if(n % 1000 == 999):\n",
    "        print (\"Error:{:.4f}; Gradient:{:.5f}\".format(\n",
    "                overallError/1000, np.linalg.norm(deriv)))\n",
    "        overallError=0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "heading_collapsed": true
   },
   "source": [
    "## Keras GRU"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Identical to the last keras rnn, but a GRU!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model=Sequential([\n",
    "        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),\n",
    "                  activation='relu', inner_init='identity'),\n",
    "        TimeDistributed(Dense(vocab_size, activation='softmax')),\n",
    "    ])\n",
    "model.compile(loss='categorical_crossentropy', optimizer=Adam())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/8\n",
      "75110/75110 [==============================] - 9s - loss: 2.3991     \n",
      "Epoch 2/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.9818     \n",
      "Epoch 3/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.8704     \n",
      "Epoch 4/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.8070     \n",
      "Epoch 5/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.7653     \n",
      "Epoch 6/8\n",
      "75110/75110 [==============================] - 10s - loss: 1.7346    \n",
      "Epoch 7/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.7108     \n",
      "Epoch 8/8\n",
      "75110/75110 [==============================] - 9s - loss: 1.6918     \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f820e8bae50>"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[' ', 't', 'h', 'i', 's', ' ', 'i', 's']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['t', 'h', 'e', 's', ' ', 'i', 's', ' ']"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nexts_oh(' this is')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Theano GRU"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Separate weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The theano GRU looks just like the simple theano RNN, except for the use of the reset and update gates. Each of these gates requires its own hidden and input weights, so we add those to our weight matrices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "W_h = id_and_bias(n_hidden)\n",
    "W_x = init_wgts(n_input, n_hidden)\n",
    "W_y = wgts_and_bias(n_hidden, n_output)\n",
    "rW_h = init_wgts(n_hidden, n_hidden)\n",
    "rW_x = wgts_and_bias(n_input, n_hidden)\n",
    "uW_h = init_wgts(n_hidden, n_hidden)\n",
    "uW_x = wgts_and_bias(n_input, n_hidden)\n",
    "w_all = list(chain.from_iterable([W_h, W_y, uW_x, rW_x]))\n",
    "w_all.extend([W_x, uW_h, rW_h])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Here's the definition of a gate - it's just a sigmoid applied to the addition of the dot products of the input vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def gate(x, h, W_h, W_x, b_x):\n",
    "    return nnet.sigmoid(T.dot(x, W_x) + b_x + T.dot(h, W_h))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Our step is nearly identical to before, except that we multiply our hidden state by our reset gate, and we update our hidden state based on the update gate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def step(x, h, W_h, b_h, W_y, b_y, uW_x, ub_x, rW_x, rb_x, W_x, uW_h, rW_h):\n",
    "    reset = gate(x, h, rW_h, rW_x, rb_x)\n",
    "    update = gate(x, h, uW_h, uW_x, ub_x)\n",
    "    h_new = gate(x, h * reset, W_h, W_x, b_h)\n",
    "    h = update*h + (1-update)*h_new\n",
    "    y = nnet.softmax(T.dot(h, W_y) + b_y)\n",
    "    return h, T.flatten(y, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Everything from here on is identical to our simple RNN in theano."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "[v_h, v_y], _ = theano.scan(step, sequences=t_inp, \n",
    "                            outputs_info=[t_h0, None], non_sequences=w_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "error = nnet.categorical_crossentropy(v_y, t_outp).sum()\n",
    "g_all = T.grad(error, w_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "upd = upd_dict(w_all, g_all, lr)\n",
    "fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error:21.89\n",
      "Error:20.52\n",
      "Error:20.55\n",
      "Error:19.87\n",
      "Error:19.01\n",
      "Error:19.58\n",
      "Error:19.45\n",
      "Error:18.93\n",
      "Error:18.51\n",
      "Error:18.75\n",
      "Error:18.16\n",
      "Error:18.18\n",
      "Error:18.90\n",
      "Error:18.03\n",
      "Error:17.50\n",
      "Error:18.39\n",
      "Error:18.11\n",
      "Error:17.92\n",
      "Error:17.50\n",
      "Error:17.38\n",
      "Error:17.17\n",
      "Error:17.11\n",
      "Error:17.49\n",
      "Error:17.04\n",
      "Error:17.40\n",
      "Error:17.23\n",
      "Error:16.83\n",
      "Error:16.97\n",
      "Error:17.02\n",
      "Error:17.25\n",
      "Error:17.46\n",
      "Error:17.18\n",
      "Error:17.41\n",
      "Error:17.07\n",
      "Error:16.78\n",
      "Error:17.39\n",
      "Error:16.68\n",
      "Error:17.23\n",
      "Error:16.75\n",
      "Error:16.96\n"
     ]
    }
   ],
   "source": [
    "err=0.0; l_rate=0.1\n",
    "for i in range(len(X)): \n",
    "    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)\n",
    "    if i % 1000 == 999: \n",
    "        l_rate *= 0.95\n",
    "        print (\"Error:{:.2f}\".format(err/1000))\n",
    "        err=0.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Combined weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We can make the previous section simpler and faster by concatenating the hidden and input matrices and inputs together. We're not going to step through this cell by cell - you'll see it's identical to the previous section except for this concatenation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "W = (shared(np.concatenate([np.eye(n_hidden), normal(size=(n_input, n_hidden))])\n",
    "            .astype(np.float32)), init_bias(n_hidden))\n",
    "\n",
    "rW = wgts_and_bias(n_input+n_hidden, n_hidden)\n",
    "uW = wgts_and_bias(n_input+n_hidden, n_hidden)\n",
    "W_y = wgts_and_bias(n_hidden, n_output)\n",
    "w_all = list(chain.from_iterable([W, W_y, uW, rW]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def gate(m, W, b): return nnet.sigmoid(T.dot(m, W) + b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def step(x, h, W, b, W_y, b_y, uW, ub, rW, rb):\n",
    "    m = T.concatenate([h, x])\n",
    "    reset = gate(m, rW, rb)\n",
    "    update = gate(m, uW, ub)\n",
    "    m = T.concatenate([h*reset, x])\n",
    "    h_new = gate(m, W, b)\n",
    "    h = update*h + (1-update)*h_new\n",
    "    y = nnet.softmax(T.dot(h, W_y) + b_y)\n",
    "    return h, T.flatten(y, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "[v_h, v_y], _ = theano.scan(step, sequences=t_inp, \n",
    "                            outputs_info=[t_h0, None], non_sequences=w_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def upd_dict(wgts, grads, lr): \n",
    "    return OrderedDict({w: w-g*lr for (w,g) in zip(wgts,grads)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "error = nnet.categorical_crossentropy(v_y, t_outp).sum()\n",
    "g_all = T.grad(error, w_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "upd = upd_dict(w_all, g_all, lr)\n",
    "fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error:24.71\n",
      "Error:22.16\n",
      "Error:21.99\n",
      "Error:21.26\n",
      "Error:20.44\n",
      "Error:20.97\n",
      "Error:20.69\n",
      "Error:20.15\n",
      "Error:19.91\n",
      "Error:20.26\n",
      "Error:19.54\n",
      "Error:19.64\n",
      "Error:20.26\n",
      "Error:19.49\n",
      "Error:18.95\n",
      "Error:19.94\n",
      "Error:19.71\n",
      "Error:19.56\n",
      "Error:18.95\n",
      "Error:18.78\n",
      "Error:18.46\n",
      "Error:18.50\n",
      "Error:19.02\n",
      "Error:18.45\n",
      "Error:18.72\n",
      "Error:18.50\n",
      "Error:18.27\n",
      "Error:18.31\n",
      "Error:18.29\n",
      "Error:18.46\n",
      "Error:18.75\n",
      "Error:18.33\n",
      "Error:18.58\n",
      "Error:18.24\n",
      "Error:17.95\n",
      "Error:18.53\n",
      "Error:17.82\n",
      "Error:18.36\n",
      "Error:17.87\n",
      "Error:18.01\n",
      "Error:17.32\n",
      "Error:17.70\n",
      "Error:17.54\n",
      "Error:17.87\n",
      "Error:17.79\n",
      "Error:17.84\n",
      "Error:17.59\n",
      "Error:17.78\n",
      "Error:17.65\n",
      "Error:17.75\n",
      "Error:17.09\n",
      "Error:17.31\n",
      "Error:16.71\n",
      "Error:16.77\n",
      "Error:17.38\n",
      "Error:17.22\n",
      "Error:16.70\n",
      "Error:17.28\n",
      "Error:17.00\n",
      "Error:16.85\n",
      "Error:16.62\n",
      "Error:17.06\n",
      "Error:16.88\n",
      "Error:16.71\n",
      "Error:16.46\n",
      "Error:16.49\n",
      "Error:16.23\n",
      "Error:16.44\n",
      "Error:16.98\n",
      "Error:16.37\n",
      "Error:16.79\n",
      "Error:16.32\n",
      "Error:16.12\n",
      "Error:16.13\n",
      "Error:16.11\n"
     ]
    }
   ],
   "source": [
    "err=0.0; l_rate=0.01\n",
    "for i in range(len(X)): \n",
    "    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)\n",
    "    if i % 1000 == 999: \n",
    "        print (\"Error:{:.2f}\".format(err/1000))\n",
    "        err=0.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### End"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  },
  "nav_menu": {},
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": true,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}