{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from keras.preprocessing import sequence\n", "from keras.utils import np_utils\n", "from keras.models import Sequential\n", "from keras.layers.core import Dense, Dropout, Activation\n", "from keras.layers.embeddings import Embedding\n", "from keras.layers.recurrent import LSTM\n", "from keras.datasets import imdb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# IMDB Movie reviews sentiment classification - Keras (Backend: TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## IMDB Movie reviews Data\n", "\n", "Kerasの提供するデータセットを利用: \n", "http://keras.io/datasets/#imdb-movie-reviews-sentiment-classification" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading data from https://s3.amazonaws.com/text-datasets/imdb.pkl\n", "33218560/33213513 [==============================] - 11s \n", "train sequences: 20000\n", "test sequences: 5000\n" ] } ], "source": [ "max_features = 20000\n", "(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)\n", "\n", "print 'train sequences: {0}'.format(len(X_train))\n", "print 'test sequences: {0}'.format(len(X_test))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[1, 20, 28, 716, 48, 495, 79, 27, 493, 8]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# データの中身\n", "# 各レビューはWord Indexのシーケンスとして符号化されている\n", "# Indexは頻度に基づき符号化されているため,例えば3番めに頻度の高いWordにはIndex「3」が付与される\n", "\n", "X_train[0][:10]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_train shape: (20000, 100)\n", "X_test shape: (5000, 100)\n" ] } ], "source": [ "# テキストは最長100におさめる\n", "maxlen = 100\n", "\n", "X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", "X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n", "\n", "print 'X_train shape: {0}'.format(X_train.shape)\n", "print 'X_test shape: {0}'.format(X_test.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implementing" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "model = Sequential()\n", "model.add(Embedding(max_features, 128, input_length=maxlen))\n", "model.add(LSTM(128))\n", "model.add(Dropout(0.5))\n", "model.add(Dense(1))\n", "model.add(Activation('sigmoid'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/amacbee/Dropbox/github/keras-conversational/lib/python2.7/site-packages/theano/scan_module/scan_perform_ext.py:133: RuntimeWarning: numpy.ndarray size changed, may indicate binary incompatibility\n", " from scan_perform.scan_perform import *\n" ] } ], "source": [ "model.compile(loss='binary_crossentropy', optimizer='adam', class_mode=\"binary\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 20000 samples, validate on 5000 samples\n", "Epoch 1/3\n", "20000/20000 [==============================] - 301s - loss: 0.4845 - acc: 0.7713 - val_loss: 0.4116 - val_acc: 0.8266\n", "Epoch 2/3\n", "20000/20000 [==============================] - 299s - loss: 0.2720 - acc: 0.8931 - val_loss: 0.3630 - val_acc: 0.8420\n", "Epoch 3/3\n", "20000/20000 [==============================] - 315s - loss: 0.1756 - acc: 0.9372 - val_loss: 0.4106 - val_acc: 0.8370\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_size = 32\n", "model.fit(X_train, y_train, batch_size=32, nb_epoch=3, validation_data=(X_test, y_test), show_accuracy=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluating" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5000/5000 [==============================] - 12s \n", "[0.41059308743695982, 0.83699999999999997]\n" ] } ], "source": [ "print(model.evaluate(X_test, y_test, batch_size=32, show_accuracy=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }