{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary depencencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import text_normalizer as tn\n", "import model_evaluation_utils as meu\n", "\n", "np.set_printoptions(precision=2, linewidth=80)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load and normalize data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " review sentiment\n", "0 One of the other reviewers has mentioned that ... positive\n", "1 A wonderful little production.

The... positive\n", "2 I thought this was a wonderful way to spend ti... positive\n", "3 Basically there's a family where a little boy ... negative\n", "4 Petter Mattei's \"Love in the Time of Money\" is... positive\n" ] } ], "source": [ "dataset = pd.read_csv(r'movie_reviews.csv')\n", "\n", "# take a peek at the data\n", "print(dataset.head())\n", "reviews = np.array(dataset['review'])\n", "sentiments = np.array(dataset['sentiment'])\n", "\n", "# build train and test datasets\n", "train_reviews = reviews[:35000]\n", "train_sentiments = sentiments[:35000]\n", "test_reviews = reviews[35000:]\n", "test_sentiments = sentiments[35000:]\n", "\n", "# normalize datasets\n", "norm_train_reviews = tn.normalize_corpus(train_reviews)\n", "norm_test_reviews = tn.normalize_corpus(test_reviews)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Traditional Supervised Machine Learning Models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Engineering" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "\n", "# build BOW features on train reviews\n", "cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))\n", "cv_train_features = cv.fit_transform(norm_train_reviews)\n", "# build TFIDF features on train reviews\n", "tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),\n", " sublinear_tf=True)\n", "tv_train_features = tv.fit_transform(norm_train_reviews)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# transform test reviews into features\n", "cv_test_features = cv.transform(norm_test_reviews)\n", "tv_test_features = tv.transform(norm_test_reviews)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BOW 
model:> Train features shape: (35000, 2114022) Test features shape: (15000, 2114022)\n", "TFIDF model:> Train features shape: (35000, 2114022) Test features shape: (15000, 2114022)\n" ] } ], "source": [ "print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)\n", "print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Training, Prediction and Performance Evaluation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.linear_model import SGDClassifier, LogisticRegression\n", "\n", "lr = LogisticRegression(penalty='l2', max_iter=100, C=1)\n", "# linear SVM trained with SGD (hinge loss); `n_iter` was deprecated in scikit-learn 0.19\n", "# and removed in 0.21, so use max_iter; tol=None disables early stopping to keep\n", "# the fixed 100 passes over the training data\n", "svm = SGDClassifier(loss='hinge', max_iter=100, tol=None)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.91\n", "Precision: 0.91\n", "Recall: 0.91\n", "F1 Score: 0.91\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.90 0.91 0.91 7510\n", " negative 0.91 0.90 0.90 7490\n", "\n", "avg / total 0.91 0.91 0.91 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6817 693\n", " negative 731 6759\n" ] } ], "source": [ "# Logistic Regression model on BOW features\n", "lr_bow_predictions = meu.train_predict_model(classifier=lr, \n", " train_features=cv_train_features, train_labels=train_sentiments,\n", " test_features=cv_test_features, test_labels=test_sentiments)\n", "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,\n", " classes=['positive', 'negative'])" ] }, { "cell_type": 
"code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.9\n", "Precision: 0.9\n", "Recall: 0.9\n", "F1 Score: 0.9\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.89 0.90 0.90 7510\n", " negative 0.90 0.89 0.90 7490\n", "\n", "avg / total 0.90 0.90 0.90 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6780 730\n", " negative 828 6662\n" ] } ], "source": [ "# Logistic Regression model on TF-IDF features\n", "lr_tfidf_predictions = meu.train_predict_model(classifier=lr, \n", " train_features=tv_train_features, train_labels=train_sentiments,\n", " test_features=tv_test_features, test_labels=test_sentiments)\n", "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_tfidf_predictions,\n", " classes=['positive', 'negative'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.9\n", "Precision: 0.9\n", "Recall: 0.9\n", "F1 Score: 0.9\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.90 0.89 0.90 7510\n", " negative 0.90 0.91 0.90 7490\n", "\n", "avg / total 0.90 0.90 0.90 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6721 789\n", " negative 711 6779\n" ] } ], "source": [ "svm_bow_predictions = meu.train_predict_model(classifier=svm, \n", " train_features=cv_train_features, train_labels=train_sentiments,\n", " test_features=cv_test_features, 
test_labels=test_sentiments)\n", "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_bow_predictions,\n", " classes=['positive', 'negative'])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.9\n", "Precision: 0.9\n", "Recall: 0.9\n", "F1 Score: 0.9\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.89 0.91 0.90 7510\n", " negative 0.91 0.88 0.90 7490\n", "\n", "avg / total 0.90 0.90 0.90 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6839 671\n", " negative 871 6619\n" ] } ], "source": [ "svm_tfidf_predictions = meu.train_predict_model(classifier=svm, \n", " train_features=tv_train_features, train_labels=train_sentiments,\n", " test_features=tv_test_features, test_labels=test_sentiments)\n", "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_tfidf_predictions,\n", " classes=['positive', 'negative'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Newer Supervised Deep Learning Models" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Program Files\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n", "Using TensorFlow backend.\n" ] } ], "source": [ "import gensim\n", "import keras\n", "from keras.models import Sequential\n", "from keras.layers import Dropout, Activation, Dense\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "## Prediction class label encoding" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "le = LabelEncoder()\n", "num_classes=2 \n", "# tokenize train reviews & encode train labels\n", "tokenized_train = [tn.tokenizer.tokenize(text)\n", " for text in norm_train_reviews]\n", "y_tr = le.fit_transform(train_sentiments)\n", "y_train = keras.utils.to_categorical(y_tr, num_classes)\n", "# tokenize test reviews & encode test labels\n", "tokenized_test = [tn.tokenizer.tokenize(text)\n", " for text in norm_test_reviews]\n", "y_ts = le.fit_transform(test_sentiments)\n", "y_test = keras.utils.to_categorical(y_ts, num_classes)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentiment class label map: {'positive': 1, 'negative': 0}\n", "Sample test label transformation:\n", "----------------------------------- \n", "Actual Labels: ['negative' 'positive' 'negative'] \n", "Encoded Labels: [0 1 0] \n", "One hot encoded Labels:\n", " [[ 1. 0.]\n", " [ 0. 1.]\n", " [ 1. 
0.]]\n" ] } ], "source": [ "# print class label encoding map and encoded labels\n", "print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))\n", "print('Sample test label transformation:\\n'+'-'*35,\n", " '\\nActual Labels:', test_sentiments[:3], '\\nEncoded Labels:', y_ts[:3], \n", " '\\nOne hot encoded Labels:\\n', y_test[:3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Engineering with word embeddings" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# build word2vec model\n", "w2v_num_features = 500\n", "w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,\n", " min_count=10, sample=1e-3) " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def averaged_word2vec_vectorizer(corpus, model, num_features):\n", "    \"\"\"Average the word vectors of each tokenized document.\n", "\n", "    Args:\n", "        corpus: iterable of documents, each an iterable of string tokens.\n", "        model: object exposing gensim-style `wv.index2word` and `wv[word]`\n", "            (this notebook passes its trained gensim Word2Vec model).\n", "        num_features: length of each word vector.\n", "\n", "    Returns:\n", "        numpy array holding one float64 vector of length num_features per\n", "        document; a document with no in-vocabulary token yields all zeros.\n", "    \"\"\"\n", "    word_vectors = model.wv\n", "    vocabulary = set(word_vectors.index2word)\n", "\n", "    def average_word_vectors(words):\n", "        feature_vector = np.zeros((num_features,), dtype=\"float64\")\n", "        nwords = 0\n", "        for word in words:\n", "            if word in vocabulary:\n", "                nwords += 1\n", "                # look vectors up on model.wv; indexing the model directly is deprecated in gensim\n", "                feature_vector += word_vectors[word]\n", "        if nwords:\n", "            feature_vector /= nwords\n", "        return feature_vector\n", "\n", "    return np.array([average_word_vectors(tokens) for tokens in corpus])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# generate averaged word vector features from word2vec model\n", "# (feature width comes from w2v_num_features so it cannot drift from the trained model)\n", "avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,\n", " num_features=w2v_num_features)\n", "avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,\n", " num_features=w2v_num_features)" ] }, { "cell_type": 
"code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# feature engineering with GloVe model\n", "train_nlp = [tn.nlp(item) for item in norm_train_reviews]\n", "train_glove_features = np.array([item.vector for item in train_nlp])\n", "\n", "test_nlp = [tn.nlp(item) for item in norm_test_reviews]\n", "test_glove_features = np.array([item.vector for item in test_nlp])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Word2Vec model:> Train features shape: (35000, 500) Test features shape: (15000, 500)\n", "GloVe model:> Train features shape: (35000, 300) Test features shape: (15000, 300)\n" ] } ], "source": [ "print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)\n", "print('GloVe model:> Train features shape:', train_glove_features.shape, ' Test features shape:', test_glove_features.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modeling with deep neural networks " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Building Deep neural network architecture" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def construct_deepnn_architecture(num_input_features):\n", " dnn_model = Sequential()\n", " dnn_model.add(Dense(512, activation='relu', input_shape=(num_input_features,)))\n", " dnn_model.add(Dropout(0.2))\n", " dnn_model.add(Dense(512, activation='relu'))\n", " dnn_model.add(Dropout(0.2))\n", " dnn_model.add(Dense(512, activation='relu'))\n", " dnn_model.add(Dropout(0.2))\n", " dnn_model.add(Dense(2))\n", " dnn_model.add(Activation('softmax'))\n", "\n", " dnn_model.compile(loss='categorical_crossentropy', optimizer='adam', \n", " metrics=['accuracy'])\n", " return dnn_model" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": true }, "outputs": [], 
"source": [ "w2v_dnn = construct_deepnn_architecture(num_input_features=500)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize sample deep architecture" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "G\n", "\n", "\n", "2610023760336\n", "\n", "InputLayer\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 500)\n", "\n", "(None, 500)\n", "\n", "\n", "2610023760112\n", "\n", "Dense\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 500)\n", "\n", "(None, 512)\n", "\n", "\n", "2610023760336->2610023760112\n", "\n", "\n", "\n", "\n", "2607322750760\n", "\n", "Dropout\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 512)\n", "\n", "\n", "2610023760112->2607322750760\n", "\n", "\n", "\n", "\n", "2607322750816\n", "\n", "Dense\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 512)\n", "\n", "\n", "2607322750760->2607322750816\n", "\n", "\n", "\n", "\n", "2612155375456\n", "\n", "Dropout\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 512)\n", "\n", "\n", "2607322750816->2612155375456\n", "\n", "\n", "\n", "\n", "2612155372992\n", "\n", "Dense\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 512)\n", "\n", "\n", "2612155375456->2612155372992\n", "\n", "\n", "\n", "\n", "2612155733440\n", "\n", "Dropout\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 512)\n", "\n", "\n", "2612155372992->2612155733440\n", "\n", "\n", "\n", "\n", "2612155736016\n", "\n", "Dense\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 512)\n", "\n", "(None, 2)\n", "\n", "\n", "2612155733440->2612155736016\n", "\n", "\n", "\n", "\n", "2612156093664\n", "\n", "Activation\n", "\n", "input:\n", "\n", "output:\n", "\n", "(None, 2)\n", "\n", "(None, 2)\n", "\n", "\n", "2612155736016->2612156093664\n", "\n", "\n", "\n", "\n", "" ], "text/plain": [ "" 
] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import SVG\n", "from keras.utils.vis_utils import model_to_dot\n", "\n", "SVG(model_to_dot(w2v_dnn, show_shapes=True, show_layer_names=False, \n", " rankdir='TB').create(prog='dot', format='svg'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model Training, Prediction and Performance Evaluation" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 31500 samples, validate on 3500 samples\n", "Epoch 1/5\n", "31500/31500 [==============================] - 11s - loss: 0.3097 - acc: 0.8720 - val_loss: 0.3159 - val_acc: 0.8646\n", "Epoch 2/5\n", "31500/31500 [==============================] - 11s - loss: 0.2869 - acc: 0.8819 - val_loss: 0.3024 - val_acc: 0.8743\n", "Epoch 3/5\n", "31500/31500 [==============================] - 11s - loss: 0.2778 - acc: 0.8857 - val_loss: 0.3012 - val_acc: 0.8763\n", "Epoch 4/5\n", "31500/31500 [==============================] - 11s - loss: 0.2708 - acc: 0.8901 - val_loss: 0.3041 - val_acc: 0.8734\n", "Epoch 5/5\n", "31500/31500 [==============================] - 11s - loss: 0.2612 - acc: 0.8920 - val_loss: 0.3023 - val_acc: 0.8763\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_size = 100\n", "w2v_dnn.fit(avg_wv_train_features, y_train, epochs=5, batch_size=batch_size, \n", " shuffle=True, validation_split=0.1, verbose=1)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14656/15000 [============================>.] 
- ETA: 0s" ] } ], "source": [ "# predict() + argmax is portable across Keras versions; Sequential.predict_classes\n", "# is not available in later releases\n", "y_pred = np.argmax(w2v_dnn.predict(avg_wv_test_features), axis=-1)\n", "predictions = le.inverse_transform(y_pred) " ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.88\n", "Precision: 0.88\n", "Recall: 0.88\n", "F1 Score: 0.88\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.88 0.89 0.88 7510\n", " negative 0.89 0.87 0.88 7490\n", "\n", "avg / total 0.88 0.88 0.88 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6711 799\n", " negative 952 6538\n" ] } ], "source": [ "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n", " classes=['positive', 'negative']) " ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "glove_dnn = construct_deepnn_architecture(num_input_features=300)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 31500 samples, validate on 3500 samples\n", "Epoch 1/5\n", "31500/31500 [==============================] - 11s - loss: 0.4171 - acc: 0.8096 - val_loss: 0.3686 - val_acc: 0.8397\n", "Epoch 2/5\n", "31500/31500 [==============================] - 10s - loss: 0.3734 - acc: 0.8364 - val_loss: 0.4048 - val_acc: 0.8129\n", "Epoch 3/5\n", "31500/31500 [==============================] - 10s - loss: 0.3657 - acc: 0.8395 - val_loss: 0.3933 - val_acc: 0.8326\n", "Epoch 4/5\n", "31500/31500 [==============================] - 10s - loss: 0.3551 - acc: 0.8450 - val_loss: 0.3555 - val_acc: 0.8403\n", "Epoch 5/5\n", "31500/31500 [==============================] - 11s - loss: 0.3523 - acc: 
0.8450 - val_loss: 0.3544 - val_acc: 0.8437\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_size = 100\n", "glove_dnn.fit(train_glove_features, y_train, epochs=5, batch_size=batch_size, \n", " shuffle=True, validation_split=0.1, verbose=1)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14816/15000 [============================>.] - ETA: 0s" ] } ], "source": [ "# predict() + argmax is portable across Keras versions; Sequential.predict_classes\n", "# is not available in later releases\n", "y_pred = np.argmax(glove_dnn.predict(test_glove_features), axis=-1)\n", "predictions = le.inverse_transform(y_pred) " ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Performance metrics:\n", "------------------------------\n", "Accuracy: 0.85\n", "Precision: 0.85\n", "Recall: 0.85\n", "F1 Score: 0.85\n", "\n", "Model Classification report:\n", "------------------------------\n", " precision recall f1-score support\n", "\n", " positive 0.85 0.85 0.85 7510\n", " negative 0.85 0.85 0.85 7490\n", "\n", "avg / total 0.85 0.85 0.85 15000\n", "\n", "\n", "Prediction Confusion Matrix:\n", "------------------------------\n", " Predicted: \n", " positive negative\n", "Actual: positive 6370 1140\n", " negative 1154 6336\n" ] } ], "source": [ "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n", " classes=['positive', 'negative']) " ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }