# Import necessary dependencies
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu

np.set_printoptions(precision=2, linewidth=80)

# Load and normalize data
dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
# 50k reviews total; first 35k train, remaining 15k test (named constant
# instead of repeating the magic number 35000 four times)
TRAIN_SIZE = 35000
train_reviews = reviews[:TRAIN_SIZE]
train_sentiments = sentiments[:TRAIN_SIZE]
test_reviews = reviews[TRAIN_SIZE:]
test_sentiments = sentiments[TRAIN_SIZE:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

# Tokenize train & test datasets
tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]
tokenized_test = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]

# Build vocabulary mapping (word to index)
from collections import Counter

# build word-to-index vocabulary from the training corpus only;
# index 0 is reserved for padding (assigned below), so real tokens start at 1
token_counter = Counter(token for review in tokenized_train for token in review)
# Counter is already a dict — no need for the redundant dict(token_counter)
# copy the original made before calling .items()
vocab_map = {token: index + 1 for index, token in enumerate(token_counter)}
# indices are assigned sequentially from 1, so the built-in max suffices
# (original used np.max(list(...)), which materializes an extra list)
max_index = max(vocab_map.values())
vocab_map['PAD_INDEX'] = 0
vocab_map['NOT_FOUND_INDEX'] = max_index+1
vocab_size = len(vocab_map)
# view vocabulary size and part of the vocabulary map
print('Vocabulary Size:', vocab_size)
print('Sample slice of vocabulary map:', dict(list(vocab_map.items())[10:20]))

# Encode and pad datasets & encode prediction class labels
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# get max length of train corpus and initialize label encoder
le = LabelEncoder()
num_classes = 2  # positive -> 1, negative -> 0
max_len = max(len(review) for review in tokenized_train)

## Train reviews data corpus
# Convert tokenized text reviews to numeric vectors; direct indexing is safe
# here because the vocabulary was built from this same training corpus
train_X = [[vocab_map[token] for token in tokenized_review]
           for tokenized_review in tokenized_train]
train_X = sequence.pad_sequences(train_X, maxlen=max_len)  # left-pad with 0 (PAD_INDEX)
## Train prediction class labels
# Convert text sentiment labels (negative/positive) to binary encodings (0/1)
train_y = le.fit_transform(train_sentiments)

## Test reviews data corpus
# Map out-of-vocabulary tokens to NOT_FOUND_INDEX with a single dict.get
# lookup. The original `vocab_map[token] if vocab_map.get(token) else
# vocab_map['NOT_FOUND_INDEX']` performed two lookups per token and,
# being a truthiness test, would also misclassify any token whose index
# is falsy (0) as missing.
test_X = [[vocab_map.get(token, vocab_map['NOT_FOUND_INDEX'])
           for token in tokenized_review]
          for tokenized_review in tokenized_test]
test_X = sequence.pad_sequences(test_X, maxlen=max_len)
## Test prediction class labels
# Convert text sentiment labels (negative/positive) to binary encodings (0/1)
test_y = le.transform(test_sentiments)

# view vector shapes
print('Max length of train review vectors:', max_len)
print('Train review vectors shape:', train_X.shape, ' Test review vectors shape:', test_X.shape)

# Build the LSTM model architecture
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

EMBEDDING_DIM = 128  # dimension for dense embeddings for each token
LSTM_DIM = 64        # total LSTM units

# Embedding input_dim = vocab_size is correct: indices run 0..max_index+1
# and len(vocab_map) == max_index + 2 after adding PAD/NOT_FOUND entries.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(LSTM_DIM, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

print(model.summary())
# Visualize model architecture
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the layer graph left-to-right with tensor shapes; SVG displays
# inline as the cell's rich output.
dot_graph = model_to_dot(model, show_shapes=True, show_layer_names=False,
                         rankdir='LR')
SVG(dot_graph.create(prog='dot', format='svg'))
# Train the model
# 10% of the training split is held out for validation each epoch.
batch_size = 100
model.fit(train_X, train_y, epochs=5, batch_size=batch_size,
          shuffle=True, validation_split=0.1, verbose=1)

# Predict and evaluate model performance
# NOTE(review): Sequential.predict_classes exists in this notebook's Keras
# version but was removed in later releases — confirm before upgrading
# (the equivalent is thresholding model.predict at 0.5).
pred_test = model.predict_classes(test_X)
# map binary predictions back to the original 'positive'/'negative' labels
predictions = le.inverse_transform(pred_test.flatten())

meu.display_model_performance_metrics(true_labels=test_sentiments,
                                      predicted_labels=predictions,
                                      classes=['positive', 'negative'])
"metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }