{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import necessary dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import text_normalizer as tn\n",
"import model_evaluation_utils as meu\n",
"\n",
"np.set_printoptions(precision=2, linewidth=80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and normalize data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n"
]
}
],
"source": [
"# number of leading rows used for training; every remaining row is test data\n",
"TRAIN_SIZE = 35000\n",
"\n",
"dataset = pd.read_csv(r'movie_reviews.csv')\n",
"\n",
"# take a peek at the data\n",
"print(dataset.head())\n",
"reviews = np.array(dataset['review'])\n",
"sentiments = np.array(dataset['sentiment'])\n",
"\n",
"# build train and test datasets from one split point, so the two slices\n",
"# can never overlap or leave a gap if the split size is changed\n",
"train_reviews, test_reviews = reviews[:TRAIN_SIZE], reviews[TRAIN_SIZE:]\n",
"train_sentiments, test_sentiments = sentiments[:TRAIN_SIZE], sentiments[TRAIN_SIZE:]\n",
"\n",
"# normalize datasets\n",
"norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
"norm_test_reviews = tn.normalize_corpus(test_reviews)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Traditional Supervised Machine Learning Models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"# Bag-of-words: term counts over unigrams and bigrams, with the vocabulary\n",
"# learned from the training reviews only.\n",
"cv = CountVectorizer(\n",
"    binary=False,\n",
"    min_df=0.0,\n",
"    max_df=1.0,\n",
"    ngram_range=(1, 2),\n",
")\n",
"cv_train_features = cv.fit_transform(norm_train_reviews)\n",
"\n",
"# TF-IDF: same n-gram range and document-frequency bounds, with\n",
"# sublinear_tf enabled; also fitted on the training reviews only.\n",
"tv = TfidfVectorizer(\n",
"    use_idf=True,\n",
"    min_df=0.0,\n",
"    max_df=1.0,\n",
"    ngram_range=(1, 2),\n",
"    sublinear_tf=True,\n",
")\n",
"tv_train_features = tv.fit_transform(norm_train_reviews)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# transform test reviews into features\n",
"cv_test_features = cv.transform(norm_test_reviews)\n",
"tv_test_features = tv.transform(norm_test_reviews)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BOW model:> Train features shape: (35000, 2114022) Test features shape: (15000, 2114022)\n",
"TFIDF model:> Train features shape: (35000, 2114022) Test features shape: (15000, 2114022)\n"
]
}
],
"source": [
"print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)\n",
"print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Training, Prediction and Performance Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
"\n",
"lr = LogisticRegression(penalty='l2', max_iter=100, C=1)\n",
"svm = SGDClassifier(loss='hinge', n_iter=100)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.91\n",
"Precision: 0.91\n",
"Recall: 0.91\n",
"F1 Score: 0.91\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.90 0.91 0.91 7510\n",
" negative 0.91 0.90 0.90 7490\n",
"\n",
"avg / total 0.91 0.91 0.91 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6817 693\n",
" negative 731 6759\n"
]
}
],
"source": [
"# Logistic Regression model on BOW features\n",
"lr_bow_predictions = meu.train_predict_model(\n",
"    classifier=lr,\n",
"    train_features=cv_train_features,\n",
"    train_labels=train_sentiments,\n",
"    test_features=cv_test_features,\n",
"    test_labels=test_sentiments,\n",
")\n",
"# compare the predictions with the true test labels\n",
"meu.display_model_performance_metrics(\n",
"    true_labels=test_sentiments,\n",
"    predicted_labels=lr_bow_predictions,\n",
"    classes=['positive', 'negative'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.9\n",
"Precision: 0.9\n",
"Recall: 0.9\n",
"F1 Score: 0.9\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.89 0.90 0.90 7510\n",
" negative 0.90 0.89 0.90 7490\n",
"\n",
"avg / total 0.90 0.90 0.90 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6780 730\n",
" negative 828 6662\n"
]
}
],
"source": [
"# Logistic Regression model on TF-IDF features\n",
"lr_tfidf_predictions = meu.train_predict_model(\n",
"    classifier=lr,\n",
"    train_features=tv_train_features,\n",
"    train_labels=train_sentiments,\n",
"    test_features=tv_test_features,\n",
"    test_labels=test_sentiments,\n",
")\n",
"# compare the predictions with the true test labels\n",
"meu.display_model_performance_metrics(\n",
"    true_labels=test_sentiments,\n",
"    predicted_labels=lr_tfidf_predictions,\n",
"    classes=['positive', 'negative'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.9\n",
"Precision: 0.9\n",
"Recall: 0.9\n",
"F1 Score: 0.9\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.90 0.89 0.90 7510\n",
" negative 0.90 0.91 0.90 7490\n",
"\n",
"avg / total 0.90 0.90 0.90 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6721 789\n",
" negative 711 6779\n"
]
}
],
"source": [
"# SVM model (SGDClassifier with hinge loss) on BOW features\n",
"svm_bow_predictions = meu.train_predict_model(\n",
"    classifier=svm,\n",
"    train_features=cv_train_features,\n",
"    train_labels=train_sentiments,\n",
"    test_features=cv_test_features,\n",
"    test_labels=test_sentiments,\n",
")\n",
"# compare the predictions with the true test labels\n",
"meu.display_model_performance_metrics(\n",
"    true_labels=test_sentiments,\n",
"    predicted_labels=svm_bow_predictions,\n",
"    classes=['positive', 'negative'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.9\n",
"Precision: 0.9\n",
"Recall: 0.9\n",
"F1 Score: 0.9\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.89 0.91 0.90 7510\n",
" negative 0.91 0.88 0.90 7490\n",
"\n",
"avg / total 0.90 0.90 0.90 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6839 671\n",
" negative 871 6619\n"
]
}
],
"source": [
"# SVM model (SGDClassifier with hinge loss) on TF-IDF features\n",
"svm_tfidf_predictions = meu.train_predict_model(\n",
"    classifier=svm,\n",
"    train_features=tv_train_features,\n",
"    train_labels=train_sentiments,\n",
"    test_features=tv_test_features,\n",
"    test_labels=test_sentiments,\n",
")\n",
"# compare the predictions with the true test labels\n",
"meu.display_model_performance_metrics(\n",
"    true_labels=test_sentiments,\n",
"    predicted_labels=svm_tfidf_predictions,\n",
"    classes=['positive', 'negative'],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Newer Supervised Deep Learning Models"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Program Files\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
" warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n",
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import gensim\n",
"import keras\n",
"from keras.models import Sequential\n",
"from keras.layers import Dropout, Activation, Dense\n",
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prediction class label encoding"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"le = LabelEncoder()\n",
"num_classes = 2\n",
"# tokenize train reviews & encode train labels\n",
"tokenized_train = [tn.tokenizer.tokenize(text)\n",
"                   for text in norm_train_reviews]\n",
"# the label-to-integer mapping is learned from the training labels only\n",
"y_tr = le.fit_transform(train_sentiments)\n",
"y_train = keras.utils.to_categorical(y_tr, num_classes)\n",
"# tokenize test reviews & encode test labels\n",
"tokenized_test = [tn.tokenizer.tokenize(text)\n",
"                  for text in norm_test_reviews]\n",
"# reuse the mapping fitted on the training labels; re-fitting on the test\n",
"# labels could silently yield a different encoding than the one the network\n",
"# is trained with, whereas transform() raises on a label it has not seen\n",
"y_ts = le.transform(test_sentiments)\n",
"y_test = keras.utils.to_categorical(y_ts, num_classes)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentiment class label map: {'positive': 1, 'negative': 0}\n",
"Sample test label transformation:\n",
"----------------------------------- \n",
"Actual Labels: ['negative' 'positive' 'negative'] \n",
"Encoded Labels: [0 1 0] \n",
"One hot encoded Labels:\n",
" [[ 1. 0.]\n",
" [ 0. 1.]\n",
" [ 1. 0.]]\n"
]
}
],
"source": [
"# print class label encoding map and encoded labels\n",
"print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))\n",
"print('Sample test label transformation:\\n'+'-'*35,\n",
" '\\nActual Labels:', test_sentiments[:3], '\\nEncoded Labels:', y_ts[:3], \n",
" '\\nOne hot encoded Labels:\\n', y_test[:3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Engineering with word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# build word2vec model\n",
"w2v_num_features = 500\n",
"w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,\n",
" min_count=10, sample=1e-3) "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def averaged_word2vec_vectorizer(corpus, model, num_features):\n",
" vocabulary = set(model.wv.index2word)\n",
" \n",
" def average_word_vectors(words, model, vocabulary, num_features):\n",
" feature_vector = np.zeros((num_features,), dtype=\"float64\")\n",
" nwords = 0.\n",
" \n",
" for word in words:\n",
" if word in vocabulary: \n",
" nwords = nwords + 1.\n",
" feature_vector = np.add(feature_vector, model[word])\n",
" if nwords:\n",
" feature_vector = np.divide(feature_vector, nwords)\n",
"\n",
" return feature_vector\n",
"\n",
" features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)\n",
" for tokenized_sentence in corpus]\n",
" return np.array(features)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# generate averaged word vector features from word2vec model\n",
"avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,\n",
" num_features=500)\n",
"avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,\n",
" num_features=500)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# feature engineering with GloVe model\n",
"train_nlp = [tn.nlp(item) for item in norm_train_reviews]\n",
"train_glove_features = np.array([item.vector for item in train_nlp])\n",
"\n",
"test_nlp = [tn.nlp(item) for item in norm_test_reviews]\n",
"test_glove_features = np.array([item.vector for item in test_nlp])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Word2Vec model:> Train features shape: (35000, 500) Test features shape: (15000, 500)\n",
"GloVe model:> Train features shape: (35000, 300) Test features shape: (15000, 300)\n"
]
}
],
"source": [
"print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)\n",
"print('GloVe model:> Train features shape:', train_glove_features.shape, ' Test features shape:', test_glove_features.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modeling with deep neural networks "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Building Deep neural network architecture"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def construct_deepnn_architecture(num_input_features, num_classes=2):\n",
"    \"\"\"Build and compile a fully connected feed-forward classifier.\n",
"\n",
"    The network stacks three hidden Dense layers of 512 ReLU units, each\n",
"    followed by Dropout(0.2), and ends in a softmax output layer.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    num_input_features : length of each input feature vector.\n",
"    num_classes : number of units in the output layer. Defaults to 2,\n",
"        the width that was previously hard-coded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The Sequential model, compiled with categorical cross-entropy loss,\n",
"    the 'adam' optimizer and the accuracy metric.\n",
"    \"\"\"\n",
"    dnn_model = Sequential()\n",
"    dnn_model.add(Dense(512, activation='relu', input_shape=(num_input_features,)))\n",
"    dnn_model.add(Dropout(0.2))\n",
"    dnn_model.add(Dense(512, activation='relu'))\n",
"    dnn_model.add(Dropout(0.2))\n",
"    dnn_model.add(Dense(512, activation='relu'))\n",
"    dnn_model.add(Dropout(0.2))\n",
"    # one output unit per class, normalized by the softmax below\n",
"    dnn_model.add(Dense(num_classes))\n",
"    dnn_model.add(Activation('softmax'))\n",
"\n",
"    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',\n",
"                      metrics=['accuracy'])\n",
"    return dnn_model"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# size the input layer from the feature matrix itself so it always matches\n",
"# the vectors the network is trained on\n",
"w2v_dnn = construct_deepnn_architecture(num_input_features=avg_wv_train_features.shape[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize sample deep architecture"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
""
],
"text/plain": [
""
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import SVG\n",
"from keras.utils.vis_utils import model_to_dot\n",
"\n",
"SVG(model_to_dot(w2v_dnn, show_shapes=True, show_layer_names=False, \n",
" rankdir='TB').create(prog='dot', format='svg'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Training, Prediction and Performance Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 31500 samples, validate on 3500 samples\n",
"Epoch 1/5\n",
"31500/31500 [==============================] - 11s - loss: 0.3097 - acc: 0.8720 - val_loss: 0.3159 - val_acc: 0.8646\n",
"Epoch 2/5\n",
"31500/31500 [==============================] - 11s - loss: 0.2869 - acc: 0.8819 - val_loss: 0.3024 - val_acc: 0.8743\n",
"Epoch 3/5\n",
"31500/31500 [==============================] - 11s - loss: 0.2778 - acc: 0.8857 - val_loss: 0.3012 - val_acc: 0.8763\n",
"Epoch 4/5\n",
"31500/31500 [==============================] - 11s - loss: 0.2708 - acc: 0.8901 - val_loss: 0.3041 - val_acc: 0.8734\n",
"Epoch 5/5\n",
"31500/31500 [==============================] - 11s - loss: 0.2612 - acc: 0.8920 - val_loss: 0.3023 - val_acc: 0.8763\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch_size = 100\n",
"w2v_dnn.fit(avg_wv_train_features, y_train, epochs=5, batch_size=batch_size, \n",
" shuffle=True, validation_split=0.1, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14656/15000 [============================>.] - ETA: 0s"
]
}
],
"source": [
"y_pred = w2v_dnn.predict_classes(avg_wv_test_features)\n",
"predictions = le.inverse_transform(y_pred) "
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.88\n",
"Precision: 0.88\n",
"Recall: 0.88\n",
"F1 Score: 0.88\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.88 0.89 0.88 7510\n",
" negative 0.89 0.87 0.88 7490\n",
"\n",
"avg / total 0.88 0.88 0.88 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6711 799\n",
" negative 952 6538\n"
]
}
],
"source": [
"meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
" classes=['positive', 'negative']) "
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# size the input layer from the feature matrix itself so it always matches\n",
"# the vectors the network is trained on\n",
"glove_dnn = construct_deepnn_architecture(num_input_features=train_glove_features.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 31500 samples, validate on 3500 samples\n",
"Epoch 1/5\n",
"31500/31500 [==============================] - 11s - loss: 0.4171 - acc: 0.8096 - val_loss: 0.3686 - val_acc: 0.8397\n",
"Epoch 2/5\n",
"31500/31500 [==============================] - 10s - loss: 0.3734 - acc: 0.8364 - val_loss: 0.4048 - val_acc: 0.8129\n",
"Epoch 3/5\n",
"31500/31500 [==============================] - 10s - loss: 0.3657 - acc: 0.8395 - val_loss: 0.3933 - val_acc: 0.8326\n",
"Epoch 4/5\n",
"31500/31500 [==============================] - 10s - loss: 0.3551 - acc: 0.8450 - val_loss: 0.3555 - val_acc: 0.8403\n",
"Epoch 5/5\n",
"31500/31500 [==============================] - 11s - loss: 0.3523 - acc: 0.8450 - val_loss: 0.3544 - val_acc: 0.8437\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"batch_size = 100\n",
"glove_dnn.fit(train_glove_features, y_train, epochs=5, batch_size=batch_size, \n",
" shuffle=True, validation_split=0.1, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"14816/15000 [============================>.] - ETA: 0s"
]
}
],
"source": [
"y_pred = glove_dnn.predict_classes(test_glove_features)\n",
"predictions = le.inverse_transform(y_pred) "
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model Performance metrics:\n",
"------------------------------\n",
"Accuracy: 0.85\n",
"Precision: 0.85\n",
"Recall: 0.85\n",
"F1 Score: 0.85\n",
"\n",
"Model Classification report:\n",
"------------------------------\n",
" precision recall f1-score support\n",
"\n",
" positive 0.85 0.85 0.85 7510\n",
" negative 0.85 0.85 0.85 7490\n",
"\n",
"avg / total 0.85 0.85 0.85 15000\n",
"\n",
"\n",
"Prediction Confusion Matrix:\n",
"------------------------------\n",
" Predicted: \n",
" positive negative\n",
"Actual: positive 6370 1140\n",
" negative 1154 6336\n"
]
}
],
"source": [
"meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
" classes=['positive', 'negative']) "
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}