{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import necessary dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import text_normalizer as tn\n",
"import model_evaluation_utils as meu\n",
"\n",
"np.set_printoptions(precision=2, linewidth=80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and normalize data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n"
]
}
],
"source": [
"# load the movie reviews dataset (columns: review, sentiment)\n",
"dataset = pd.read_csv(r'movie_reviews.csv')\n",
"\n",
"# take a peek at the data\n",
"print(dataset.head())\n",
"reviews = np.array(dataset['review'])\n",
"sentiments = np.array(dataset['sentiment'])\n",
"\n",
"# build train and test datasets (fixed split: first 35000 train, remaining 15000 test)\n",
"train_reviews = reviews[:35000]\n",
"train_sentiments = sentiments[:35000]\n",
"test_reviews = reviews[35000:]\n",
"test_sentiments = sentiments[35000:]\n",
"\n",
"# normalize datasets (text cleanup via the project-local text_normalizer module)\n",
"norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
"norm_test_reviews = tn.normalize_corpus(test_reviews)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tokenize train & test datasets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# split each normalized review into a list of tokens\n",
"# (tokenizer is provided by the project-local text_normalizer module)\n",
"tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]\n",
"tokenized_test = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build Vocabulary Mapping (word to index)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary Size: 82358\n",
"Sample slice of vocabulary map: {'martyrdom': 6, 'palmira': 7, 'servility': 8, 'gardening': 9, 'melodramatically': 73505, 'renfro': 41282, 'carlin': 41283, 'overtly': 41284, 'rend': 47891, 'anticlimactic': 51}\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
"# build word to index vocabulary from the training tokens only;\n",
"# indices start at 1 so that 0 stays free for the padding token\n",
"token_counter = Counter([token for review in tokenized_train for token in review])\n",
"# Counter is already a dict, so iterate its items directly (no dict() copy needed)\n",
"vocab_map = {item[0]: index+1 for index, item in enumerate(token_counter.items())}\n",
"max_index = np.max(list(vocab_map.values()))\n",
"# reserve special indices: 0 for padding, max+1 for out-of-vocabulary tokens\n",
"vocab_map['PAD_INDEX'] = 0\n",
"vocab_map['NOT_FOUND_INDEX'] = max_index+1\n",
"vocab_size = len(vocab_map)\n",
"# view vocabulary size and part of the vocabulary map\n",
"print('Vocabulary Size:', vocab_size)\n",
"print('Sample slice of vocabulary map:', dict(list(vocab_map.items())[10:20]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Encode and Pad datasets & Encode prediction class labels"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Max length of train review vectors: 1442\n",
"Train review vectors shape: (35000, 1442) Test review vectors shape: (15000, 1442)\n"
]
}
],
"source": [
"from keras.preprocessing import sequence\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# get max length of train corpus and initialize label encoder\n",
"le = LabelEncoder()\n",
"num_classes=2 # positive -> 1, negative -> 0\n",
"max_len = np.max([len(review) for review in tokenized_train])\n",
"\n",
"## Train reviews data corpus\n",
"# Convert tokenized text reviews to numeric vectors (all train tokens are in vocab_map)\n",
"train_X = [[vocab_map[token] for token in tokenized_review] for tokenized_review in tokenized_train]\n",
"train_X = sequence.pad_sequences(train_X, maxlen=max_len) # pad shorter reviews up to max_len\n",
"## Train prediction class labels\n",
"# Convert text sentiment labels (negative/positive) to binary encodings (0/1)\n",
"train_y = le.fit_transform(train_sentiments)\n",
"\n",
"## Test reviews data corpus\n",
"# Convert tokenized text reviews to numeric vectors; tokens unseen during training\n",
"# fall back to the reserved NOT_FOUND_INDEX. A single dict.get with a default\n",
"# replaces the old double lookup, which also misrouted any token whose index was\n",
"# falsy (0) to NOT_FOUND_INDEX.\n",
"test_X = [[vocab_map.get(token, vocab_map['NOT_FOUND_INDEX']) \n",
"           for token in tokenized_review] \n",
"          for tokenized_review in tokenized_test]\n",
"test_X = sequence.pad_sequences(test_X, maxlen=max_len)\n",
"## Test prediction class labels\n",
"# Convert text sentiment labels (negative/positive) to binary encodings (0/1)\n",
"test_y = le.transform(test_sentiments)\n",
"\n",
"# view vector shapes\n",
"print('Max length of train review vectors:', max_len)\n",
"print('Train review vectors shape:', train_X.shape, ' Test review vectors shape:', test_X.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build the LSTM Model Architecture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D\n",
"from keras.layers import LSTM\n",
"\n",
"EMBEDDING_DIM = 128 # dimension for dense embeddings for each token\n",
"LSTM_DIM = 64 # total LSTM units\n",
"\n",
"# architecture: embedding -> spatial dropout -> LSTM -> sigmoid output\n",
"model = Sequential()\n",
"model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_len))\n",
"model.add(SpatialDropout1D(0.2)) # drops whole embedding channels for regularization\n",
"model.add(LSTM(LSTM_DIM, dropout=0.2, recurrent_dropout=0.2))\n",
"model.add(Dense(1, activation=\"sigmoid\")) # single unit: probability of the positive class (1)\n",
"\n",
"# binary cross-entropy matches the 0/1 labels produced by the LabelEncoder\n",
"model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n",
" metrics=[\"accuracy\"])"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"embedding_4 (Embedding) (None, 1442, 128) 10541824 \n",
"_________________________________________________________________\n",
"spatial_dropout1d_4 (Spatial (None, 1442, 128) 0 \n",
"_________________________________________________________________\n",
"lstm_3 (LSTM) (None, 64) 49408 \n",
"_________________________________________________________________\n",
"dense_3 (Dense) (None, 1) 65 \n",
"=================================================================\n",
"Total params: 10,591,297\n",
"Trainable params: 10,591,297\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"None\n"
]
}
],
"source": [
"# inspect layer output shapes and parameter counts\n",
"print(model.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualize model architecture"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
""
],
"text/plain": [
""
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import SVG\n",
"from keras.utils.vis_utils import model_to_dot\n",
"\n",
"# render the model graph left-to-right as an inline SVG\n",
"# (prog='dot' invokes graphviz, so graphviz/pydot must be installed)\n",
"SVG(model_to_dot(model, show_shapes=True, show_layer_names=False, \n",
" rankdir='LR').create(prog='dot', format='svg'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train the model"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 31500 samples, validate on 3500 samples\n",
"Epoch 1/5\n",
"31500/31500 [==============================] - 2491s - loss: 0.4081 - acc: 0.8184 - val_loss: 0.3006 - val_acc: 0.8751\n",
"Epoch 2/5\n",
"31500/31500 [==============================] - 2489s - loss: 0.2253 - acc: 0.9158 - val_loss: 0.3209 - val_acc: 0.8780\n",
"Epoch 3/5\n",
"31500/31500 [==============================] - 2656s - loss: 0.1431 - acc: 0.9493 - val_loss: 0.3483 - val_acc: 0.8671\n",
"Epoch 4/5\n",
"31500/31500 [==============================] - 2604s - loss: 0.1023 - acc: 0.9658 - val_loss: 0.3803 - val_acc: 0.8729\n",
"Epoch 5/5\n",
"31500/31500 [==============================] - 2701s - loss: 0.0694 - acc: 0.9761 - val_loss: 0.4430 - val_acc: 0.8706\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# train for 5 epochs, holding out 10% of the training data for validation\n",
"batch_size = 100\n",
"model.fit(train_X, train_y, epochs=5, batch_size=batch_size, \n",
" shuffle=True, validation_split=0.1, verbose=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predict and Evaluate Model Performance"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15000/15000 [==============================] - 352s \n"
]
}
],
"source": [
"# predict 0/1 classes for the test set, then map them back to the original\n",
"# text labels ('negative'/'positive') via the fitted LabelEncoder\n",
"# NOTE(review): predict_classes exists only on Sequential models in older Keras;\n",
"# newer Keras removed it in favor of thresholding model.predict — confirm version\n",
"pred_test = model.predict_classes(test_X)\n",
"predictions = le.inverse_transform(pred_test.flatten())"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.88\n",
"Precision: 0.88\n",
"Recall: 0.88\n",
"F1 Score: 0.88\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.87 0.88 0.88 7510\n",
" negative 0.88 0.87 0.88 7490\n",
"\n",
"avg / total 0.88 0.88 0.88 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6633 877\n",
" negative 972 6518\n"
]
}
],
"source": [
"# print accuracy/precision/recall/F1, a per-class classification report and a\n",
"# confusion matrix via the project-local model_evaluation_utils helper\n",
"meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
" classes=['positive', 'negative']) "
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}