{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)\n",
      "/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.\n",
      "  warnings.warn(warn)\n"
     ]
    }
   ],
   "source": [
    "from theano.sandbox import cuda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import utils; reload(utils)\n",
    "from utils import *\n",
    "from __future__ import division, print_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_path = 'data/imdb/models/'\n",
    "%mkdir -p $model_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment. Keras comes with some helpers for this dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from keras.datasets import imdb\n",
    "idx = imdb.get_word_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the word list:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx_arr = sorted(idx, key=idx.get)\n",
    "idx_arr[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "...and this is the mapping from id to word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "idx2word = {v: k for k, v in idx.iteritems()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We download the reviews using code copied from keras.datasets:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl\n",
      "65298432/65552540 [============================>.] - ETA: 0s"
     ]
    }
   ],
   "source": [
    "path = get_file('imdb_full.pkl',\n",
    "                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',\n",
    "                md5_hash='d091312047c43cf9e4e38fef92437263')\n",
    "f = open(path, 'rb')\n",
    "(x_train, labels_train), (x_test, labels_test) = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "len(x_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's the 1st review. As you see, the words have been replaced by ids. The ids can be looked up in idx2word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "', '.join(map(str, x_train[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first word of the first review is 23022. Let's see what that is."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "idx2word[23022]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's the whole review, mapped from ids to words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "' '.join([idx2word[o] for o in x_train[0]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The labels are 1 for positive, 0 for negative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels_train[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reduce vocab size by setting rare words to max index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vocab_size = 5000\n",
    "\n",
    "trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]\n",
    "test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Look at distribution of lengths of sentences."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2493, 10, 237.71364)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lens = np.array(map(len, trn))\n",
    "(lens.max(), lens.min(), lens.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pad (with zero) or truncate each sentence to make consistent length."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "seq_len = 500\n",
    "\n",
    "trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)\n",
    "test = sequence.pad_sequences(test, maxlen=seq_len, value=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This results in nice rectangular matrices that can be passed to ML algorithms. Reviews shorter than 500 words are pre-padded with zeros, those greater are truncated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 500)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trn.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create simple models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Single hidden layer NN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The simplest model that tends to give reasonable results is a single hidden layer net. So let's try that. Note that we can't expect to get any useful results by feeding word ids directly into a neural net - so instead we use an embedding to replace them with a vector of 32 (initially random) floats for each word in the vocab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Sequential([\n",
    "    Embedding(vocab_size, 32, input_length=seq_len),\n",
    "    Flatten(),\n",
    "    Dense(100, activation='relu'),\n",
    "    Dropout(0.7),\n",
    "    Dense(1, activation='sigmoid')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "____________________________________________________________________________________________________\n",
      "Layer (type)                     Output Shape          Param #     Connected to                     \n",
      "====================================================================================================\n",
      "embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          \n",
      "____________________________________________________________________________________________________\n",
      "flatten_1 (Flatten)              (None, 16000)         0           embedding_1[0][0]                \n",
      "____________________________________________________________________________________________________\n",
      "dense_1 (Dense)                  (None, 100)           1600100     flatten_1[0][0]                  \n",
      "____________________________________________________________________________________________________\n",
      "dropout_1 (Dropout)              (None, 100)           0           dense_1[0][0]                    \n",
      "____________________________________________________________________________________________________\n",
      "dense_2 (Dense)                  (None, 1)             101         dropout_1[0][0]                  \n",
      "====================================================================================================\n",
      "Total params: 1760201\n",
      "____________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/2\n",
      "25000/25000 [==============================] - 1s - loss: 0.4651 - acc: 0.7495 - val_loss: 0.2830 - val_acc: 0.8804\n",
      "Epoch 2/2\n",
      "25000/25000 [==============================] - 1s - loss: 0.1969 - acc: 0.9265 - val_loss: 0.3195 - val_acc: 0.8694\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f0e084f4210>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The [stanford paper](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf) that this dataset is from cites a state of the art accuracy (without unlabelled data) of 0.883. So we're short of that, but on the right track."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "### Single conv layer with max pooling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "A CNN is likely to work better, since it's designed to take advantage of ordered data. We'll need to use a 1D CNN, since a sequence of words is 1D."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "conv1 = Sequential([\n",
    "    Embedding(vocab_size, 32, input_length=seq_len, dropout=0.2),\n",
    "    Dropout(0.2),\n",
    "    Convolution1D(64, 5, border_mode='same', activation='relu'),\n",
    "    Dropout(0.2),\n",
    "    MaxPooling1D(),\n",
    "    Flatten(),\n",
    "    Dense(100, activation='relu'),\n",
    "    Dropout(0.7),\n",
    "    Dense(1, activation='sigmoid')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/4\n",
      "25000/25000 [==============================] - 4s - loss: 0.4984 - acc: 0.7250 - val_loss: 0.2922 - val_acc: 0.8816\n",
      "Epoch 2/4\n",
      "25000/25000 [==============================] - 4s - loss: 0.2971 - acc: 0.8836 - val_loss: 0.2681 - val_acc: 0.8911\n",
      "Epoch 3/4\n",
      "25000/25000 [==============================] - 4s - loss: 0.2568 - acc: 0.8983 - val_loss: 0.2551 - val_acc: 0.8947\n",
      "Epoch 4/4\n",
      "25000/25000 [==============================] - 4s - loss: 0.2427 - acc: 0.9029 - val_loss: 0.2558 - val_acc: 0.8947\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f99cfa785d0>"
      ]
     },
     "execution_count": 278,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "conv1.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=4, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "That's well past the Stanford paper's accuracy - another win for CNNs!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "conv1.save_weights(model_path + 'conv1.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "conv1.load_weights(model_path + 'conv1.h5')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Pre-trained vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "You may want to look at wordvectors.ipynb before moving on.\n",
    "\n",
    "In this section, we replicate the previous CNN, but using pre-trained embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_glove_dataset(dataset):\n",
    "    \"\"\"Download the requested glove dataset from files.fast.ai\n",
    "    and return a location that can be passed to load_vectors.\n",
    "    \"\"\"\n",
    "    # see wordvectors.ipynb for info on how these files were\n",
    "    # generated from the original glove data.\n",
    "    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',\n",
    "               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',\n",
    "               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',\n",
    "               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}\n",
    "    glove_path = os.path.abspath('data/glove/results')\n",
    "    %mkdir -p $glove_path\n",
    "    return get_file(dataset,\n",
    "                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',\n",
    "                    cache_subdir=glove_path,\n",
    "                    md5_hash=md5sums.get(dataset, None),\n",
    "                    untar=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def load_vectors(loc):\n",
    "    return (load_array(loc+'.dat'),\n",
    "        pickle.load(open(loc+'_words.pkl','rb')),\n",
    "        pickle.load(open(loc+'_idx.pkl','rb')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from http://files.fast.ai/models/glove/6B.50d.tgz\n",
      "80101376/80107627 [============================>.] - ETA: 0sUntaring file...\n"
     ]
    }
   ],
   "source": [
    "vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "The glove word ids and imdb word ids use different indexes. So we create a simple function that creates an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def create_emb():\n",
    "    n_fact = vecs.shape[1]\n",
    "    emb = np.zeros((vocab_size, n_fact))\n",
    "\n",
    "    for i in range(1,len(emb)):\n",
    "        word = idx2word[i]\n",
    "        if word and re.match(r\"^[a-zA-Z0-9\\-]*$\", word):\n",
    "            src_idx = wordidx[word]\n",
    "            emb[i] = vecs[src_idx]\n",
    "        else:\n",
    "            # If we can't find the word in glove, randomly initialize\n",
    "            emb[i] = normal(scale=0.6, size=(n_fact,))\n",
    "\n",
    "    # This is our \"rare word\" id - we want to randomly initialize\n",
    "    emb[-1] = normal(scale=0.6, size=(n_fact,))\n",
    "    emb/=3\n",
    "    return emb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "emb = create_emb()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We pass our embedding matrix to the Embedding constructor, and set it to non-trainable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Sequential([\n",
    "    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2, \n",
    "              weights=[emb], trainable=False),\n",
    "    Dropout(0.25),\n",
    "    Convolution1D(64, 5, border_mode='same', activation='relu'),\n",
    "    Dropout(0.25),\n",
    "    MaxPooling1D(),\n",
    "    Flatten(),\n",
    "    Dense(100, activation='relu'),\n",
    "    Dropout(0.7),\n",
    "    Dense(1, activation='sigmoid')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/2\n",
      "25000/25000 [==============================] - 4s - loss: 0.5217 - acc: 0.7172 - val_loss: 0.2942 - val_acc: 0.8815\n",
      "Epoch 2/2\n",
      "25000/25000 [==============================] - 4s - loss: 0.3169 - acc: 0.8719 - val_loss: 0.2662 - val_acc: 0.8978\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f0de0f2d910>"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We already have beaten our previous model! But let's fine-tune the embedding weights - especially since the words we couldn't find in glove just have random embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.layers[0].trainable=True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr=1e-4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/1\n",
      "25000/25000 [==============================] - 4s - loss: 0.2751 - acc: 0.8911 - val_loss: 0.2500 - val_acc: 0.9008\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f0de0c4e0d0>"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=1, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "As expected, that's given us a nice little boost. :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.save_weights(model_path+'glove50.h5')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## Multi-size CNN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This is an implementation of a multi-size CNN as shown in Ben Bowles' [excellent blog post](https://quid.com/feed/how-quid-uses-deep-learning-with-small-data)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "from keras.layers import Merge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We use the functional API to create multiple conv layers of different sizes, and then concatenate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "graph_in = Input ((vocab_size, 50))\n",
    "convs = [ ] \n",
    "for fsz in range (3, 6): \n",
    "    x = Convolution1D(64, fsz, border_mode='same', activation=\"relu\")(graph_in)\n",
    "    x = MaxPooling1D()(x) \n",
    "    x = Flatten()(x) \n",
    "    convs.append(x)\n",
    "out = Merge(mode=\"concat\")(convs) \n",
    "graph = Model(graph_in, out) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "emb = create_emb()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We then replace the conv/max-pool layer in our original CNN with the concatenated conv layers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model = Sequential ([\n",
    "    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.2, weights=[emb]),\n",
    "    Dropout (0.2),\n",
    "    graph,\n",
    "    Dropout (0.5),\n",
    "    Dense (100, activation=\"relu\"),\n",
    "    Dropout (0.7),\n",
    "    Dense (1, activation='sigmoid')\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/2\n",
      "25000/25000 [==============================] - 11s - loss: 0.3997 - acc: 0.8207 - val_loss: 0.3032 - val_acc: 0.8943\n",
      "Epoch 2/2\n",
      "25000/25000 [==============================] - 11s - loss: 0.2882 - acc: 0.8832 - val_loss: 0.2646 - val_acc: 0.9029\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f55b79b7990>"
      ]
     },
     "execution_count": 177,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "Interestingly, I found that in this case I got best results when I started the embedding layer as being trainable, and then set it to non-trainable after a couple of epochs. I have no idea why!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.layers[0].trainable=False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {
    "collapsed": false,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "model.optimizer.lr=1e-5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/2\n",
      "25000/25000 [==============================] - 11s - loss: 0.2556 - acc: 0.8949 - val_loss: 0.2534 - val_acc: 0.9024\n",
      "Epoch 2/2\n",
      "25000/25000 [==============================] - 11s - loss: 0.2360 - acc: 0.9057 - val_loss: 0.2577 - val_acc: 0.9036\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f55b74de110>"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=2, batch_size=64)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "This more complex architecture has given us another boost in accuracy."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## LSTM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "We haven't covered this bit yet!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "____________________________________________________________________________________________________\n",
      "Layer (type)                     Output Shape          Param #     Connected to                     \n",
      "====================================================================================================\n",
      "embedding_13 (Embedding)         (None, 500, 32)       160064      embedding_input_13[0][0]         \n",
      "____________________________________________________________________________________________________\n",
      "lstm_13 (LSTM)                   (None, 100)           53200       embedding_13[0][0]               \n",
      "____________________________________________________________________________________________________\n",
      "dense_18 (Dense)                 (None, 1)             101         lstm_13[0][0]                    \n",
      "====================================================================================================\n",
      "Total params: 213365\n",
      "____________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model = Sequential([\n",
    "    Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True,\n",
    "              W_regularizer=l2(1e-6), dropout=0.2),\n",
    "    LSTM(100, consume_less='gpu'),\n",
    "    Dense(1, activation='sigmoid')])\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false,
    "hidden": true,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/5\n",
      "25000/25000 [==============================] - 100s - loss: 0.5007 - acc: 0.7446 - val_loss: 0.3475 - val_acc: 0.8531\n",
      "Epoch 2/5\n",
      "25000/25000 [==============================] - 100s - loss: 0.3524 - acc: 0.8507 - val_loss: 0.3602 - val_acc: 0.8453\n",
      "Epoch 3/5\n",
      "25000/25000 [==============================] - 99s - loss: 0.3750 - acc: 0.8342 - val_loss: 0.4758 - val_acc: 0.7710\n",
      "Epoch 4/5\n",
      "25000/25000 [==============================] - 99s - loss: 0.3238 - acc: 0.8652 - val_loss: 0.3094 - val_acc: 0.8725\n",
      "Epoch 5/5\n",
      "25000/25000 [==============================] - 99s - loss: 0.2681 - acc: 0.8920 - val_loss: 0.3018 - val_acc: 0.8776\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f9a16b12c50>"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(trn, labels_train, validation_data=(test, labels_test), nb_epoch=5, batch_size=64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}