{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)\n", "/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.\n", " warnings.warn(warn)\n" ] } ], "source": [ "from theano.sandbox import cuda" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using Theano backend.\n" ] } ], "source": [ "%matplotlib inline\n", "import utils; reload(utils)\n", "from utils import *\n", "from __future__ import division, print_function" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "model_path = 'data/imdb/models/'\n", "%mkdir -p $model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment. Keras comes with some helpers for this dataset." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from keras.datasets import imdb\n", "idx = imdb.get_word_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the word list:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idx_arr = sorted(idx, key=idx.get)\n", "idx_arr[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "...and this is the mapping from id to word" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "idx2word = {v: k for k, v in idx.iteritems()}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We download the reviews using code copied from keras.datasets:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl\n", "65298432/65552540 [============================>.] - ETA: 0s" ] } ], "source": [ "path = get_file('imdb_full.pkl',\n", " origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',\n", " md5_hash='d091312047c43cf9e4e38fef92437263')\n", "# Unpickling can execute arbitrary code, so only load this file from a trusted origin.\n", "# The context manager closes the file handle as soon as the pickle has been read.\n", "with open(path, 'rb') as f:\n", "    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "len(x_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the 1st review. As you see, the words have been replaced by ids. The ids can be looked up in idx2word." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "', '.join(map(str, x_train[0]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first word of the first review is 23022. Let's see what that is." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "idx2word[23022]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the whole review, mapped from ids to words." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "' '.join([idx2word[o] for o in x_train[0]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The labels are 1 for positive, 0 for negative." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels_train[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reduce vocab size by setting rare words to max index." ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vocab_size = 5000\n", "\n", "trn = [np.array([i if i