{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import necessary depencencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import text_normalizer as tn\n",
    "import model_evaluation_utils as meu\n",
    "\n",
    "np.set_printoptions(precision=2, linewidth=80)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and normalize data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              review sentiment\n",
      "0  One of the other reviewers has mentioned that ...  positive\n",
      "1  A wonderful little production. <br /><br />The...  positive\n",
      "2  I thought this was a wonderful way to spend ti...  positive\n",
      "3  Basically there's a family where a little boy ...  negative\n",
      "4  Petter Mattei's \"Love in the Time of Money\" is...  positive\n"
     ]
    }
   ],
   "source": [
    "dataset = pd.read_csv(r'movie_reviews.csv')\n",
    "\n",
    "# take a peek at the data\n",
    "print(dataset.head())\n",
    "reviews = np.array(dataset['review'])\n",
    "sentiments = np.array(dataset['sentiment'])\n",
    "\n",
    "# build train and test datasets\n",
    "train_reviews = reviews[:35000]\n",
    "train_sentiments = sentiments[:35000]\n",
    "test_reviews = reviews[35000:]\n",
    "test_sentiments = sentiments[35000:]\n",
    "\n",
    "# normalize datasets\n",
    "norm_train_reviews = tn.normalize_corpus(train_reviews)\n",
    "norm_test_reviews = tn.normalize_corpus(test_reviews)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Traditional Supervised Machine Learning Models"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "\n",
    "# build BOW features on train reviews\n",
    "cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))\n",
    "cv_train_features = cv.fit_transform(norm_train_reviews)\n",
    "# build TFIDF features on train reviews\n",
    "tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),\n",
    "                     sublinear_tf=True)\n",
    "tv_train_features = tv.fit_transform(norm_train_reviews)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# transform test reviews into features\n",
    "cv_test_features = cv.transform(norm_test_reviews)\n",
    "tv_test_features = tv.transform(norm_test_reviews)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BOW model:> Train features shape: (35000, 2114022)  Test features shape: (15000, 2114022)\n",
      "TFIDF model:> Train features shape: (35000, 2114022)  Test features shape: (15000, 2114022)\n"
     ]
    }
   ],
   "source": [
    "print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)\n",
    "print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Training, Prediction and Performance Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
    "\n",
    "lr = LogisticRegression(penalty='l2', max_iter=100, C=1)\n",
    "svm = SGDClassifier(loss='hinge', n_iter=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.91\n",
      "Precision: 0.91\n",
      "Recall: 0.91\n",
      "F1 Score: 0.91\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.90      0.91      0.91      7510\n",
      "   negative       0.91      0.90      0.90      7490\n",
      "\n",
      "avg / total       0.91      0.91      0.91     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6817      693\n",
      "        negative        731     6759\n"
     ]
    }
   ],
   "source": [
    "# Logistic Regression model on BOW features\n",
    "lr_bow_predictions = meu.train_predict_model(classifier=lr, \n",
    "                                             train_features=cv_train_features, train_labels=train_sentiments,\n",
    "                                             test_features=cv_test_features, test_labels=test_sentiments)\n",
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,\n",
    "                                      classes=['positive', 'negative'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.9\n",
      "Precision: 0.9\n",
      "Recall: 0.9\n",
      "F1 Score: 0.9\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.89      0.90      0.90      7510\n",
      "   negative       0.90      0.89      0.90      7490\n",
      "\n",
      "avg / total       0.90      0.90      0.90     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6780      730\n",
      "        negative        828     6662\n"
     ]
    }
   ],
   "source": [
    "# Logistic Regression model on TF-IDF features\n",
    "lr_tfidf_predictions = meu.train_predict_model(classifier=lr, \n",
    "                                               train_features=tv_train_features, train_labels=train_sentiments,\n",
    "                                               test_features=tv_test_features, test_labels=test_sentiments)\n",
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_tfidf_predictions,\n",
    "                                      classes=['positive', 'negative'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.9\n",
      "Precision: 0.9\n",
      "Recall: 0.9\n",
      "F1 Score: 0.9\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.90      0.89      0.90      7510\n",
      "   negative       0.90      0.91      0.90      7490\n",
      "\n",
      "avg / total       0.90      0.90      0.90     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6721      789\n",
      "        negative        711     6779\n"
     ]
    }
   ],
   "source": [
    "svm_bow_predictions = meu.train_predict_model(classifier=svm, \n",
    "                                             train_features=cv_train_features, train_labels=train_sentiments,\n",
    "                                             test_features=cv_test_features, test_labels=test_sentiments)\n",
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_bow_predictions,\n",
    "                                      classes=['positive', 'negative'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.9\n",
      "Precision: 0.9\n",
      "Recall: 0.9\n",
      "F1 Score: 0.9\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.89      0.91      0.90      7510\n",
      "   negative       0.91      0.88      0.90      7490\n",
      "\n",
      "avg / total       0.90      0.90      0.90     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6839      671\n",
      "        negative        871     6619\n"
     ]
    }
   ],
   "source": [
    "svm_tfidf_predictions = meu.train_predict_model(classifier=svm, \n",
    "                                                train_features=tv_train_features, train_labels=train_sentiments,\n",
    "                                                test_features=tv_test_features, test_labels=test_sentiments)\n",
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=svm_tfidf_predictions,\n",
    "                                      classes=['positive', 'negative'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Newer Supervised Deep Learning Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
      "  warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import gensim\n",
    "import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dropout, Activation, Dense\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction class label encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "le = LabelEncoder()\n",
    "num_classes=2 \n",
    "# tokenize train reviews & encode train labels\n",
    "tokenized_train = [tn.tokenizer.tokenize(text)\n",
    "                   for text in norm_train_reviews]\n",
    "y_tr = le.fit_transform(train_sentiments)\n",
    "y_train = keras.utils.to_categorical(y_tr, num_classes)\n",
    "# tokenize test reviews & encode test labels\n",
    "tokenized_test = [tn.tokenizer.tokenize(text)\n",
    "                   for text in norm_test_reviews]\n",
    "y_ts = le.fit_transform(test_sentiments)\n",
    "y_test = keras.utils.to_categorical(y_ts, num_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentiment class label map: {'positive': 1, 'negative': 0}\n",
      "Sample test label transformation:\n",
      "----------------------------------- \n",
      "Actual Labels: ['negative' 'positive' 'negative'] \n",
      "Encoded Labels: [0 1 0] \n",
      "One hot encoded Labels:\n",
      " [[ 1.  0.]\n",
      " [ 0.  1.]\n",
      " [ 1.  0.]]\n"
     ]
    }
   ],
   "source": [
    "# print class label encoding map and encoded labels\n",
    "print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))\n",
    "print('Sample test label transformation:\\n'+'-'*35,\n",
    "      '\\nActual Labels:', test_sentiments[:3], '\\nEncoded Labels:', y_ts[:3], \n",
    "      '\\nOne hot encoded Labels:\\n', y_test[:3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering with word embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build word2vec model\n",
    "w2v_num_features = 500\n",
    "w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,\n",
    "                                   min_count=10, sample=1e-3)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def averaged_word2vec_vectorizer(corpus, model, num_features):\n",
    "    vocabulary = set(model.wv.index2word)\n",
    "    \n",
    "    def average_word_vectors(words, model, vocabulary, num_features):\n",
    "        feature_vector = np.zeros((num_features,), dtype=\"float64\")\n",
    "        nwords = 0.\n",
    "        \n",
    "        for word in words:\n",
    "            if word in vocabulary: \n",
    "                nwords = nwords + 1.\n",
    "                feature_vector = np.add(feature_vector, model[word])\n",
    "        if nwords:\n",
    "            feature_vector = np.divide(feature_vector, nwords)\n",
    "\n",
    "        return feature_vector\n",
    "\n",
    "    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)\n",
    "                    for tokenized_sentence in corpus]\n",
    "    return np.array(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# generate averaged word vector features from word2vec model\n",
    "avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,\n",
    "                                                     num_features=500)\n",
    "avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,\n",
    "                                                    num_features=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# feature engineering with GloVe model\n",
    "train_nlp = [tn.nlp(item) for item in norm_train_reviews]\n",
    "train_glove_features = np.array([item.vector for item in train_nlp])\n",
    "\n",
    "test_nlp = [tn.nlp(item) for item in norm_test_reviews]\n",
    "test_glove_features = np.array([item.vector for item in test_nlp])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Word2Vec model:> Train features shape: (35000, 500)  Test features shape: (15000, 500)\n",
      "GloVe model:> Train features shape: (35000, 300)  Test features shape: (15000, 300)\n"
     ]
    }
   ],
   "source": [
    "print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)\n",
    "print('GloVe model:> Train features shape:', train_glove_features.shape, ' Test features shape:', test_glove_features.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Modeling with deep neural networks "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Building Deep neural network architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def construct_deepnn_architecture(num_input_features):\n",
    "    dnn_model = Sequential()\n",
    "    dnn_model.add(Dense(512, activation='relu', input_shape=(num_input_features,)))\n",
    "    dnn_model.add(Dropout(0.2))\n",
    "    dnn_model.add(Dense(512, activation='relu'))\n",
    "    dnn_model.add(Dropout(0.2))\n",
    "    dnn_model.add(Dense(512, activation='relu'))\n",
    "    dnn_model.add(Dropout(0.2))\n",
    "    dnn_model.add(Dense(2))\n",
    "    dnn_model.add(Activation('softmax'))\n",
    "\n",
    "    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',                 \n",
    "                      metrics=['accuracy'])\n",
    "    return dnn_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "w2v_dnn = construct_deepnn_architecture(num_input_features=500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize sample deep architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg height=\"719pt\" viewBox=\"0.00 0.00 224.00 719.00\" width=\"224pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 715)\">\n",
       "<title>G</title>\n",
       "<polygon fill=\"white\" points=\"-4,4 -4,-715 220,-715 220,4 -4,4\" stroke=\"none\"/>\n",
       "<!-- 2610023760336 -->\n",
       "<g class=\"node\" id=\"node1\"><title>2610023760336</title>\n",
       "<polygon fill=\"none\" points=\"0,-664.5 0,-710.5 216,-710.5 216,-664.5 0,-664.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-683.8\">InputLayer</text>\n",
       "<polyline fill=\"none\" points=\"77,-664.5 77,-710.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"105\" y=\"-695.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"77,-687.5 133,-687.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"105\" y=\"-672.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"133,-664.5 133,-710.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"174.5\" y=\"-695.3\">(None, 500)</text>\n",
       "<polyline fill=\"none\" points=\"133,-687.5 216,-687.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"174.5\" y=\"-672.3\">(None, 500)</text>\n",
       "</g>\n",
       "<!-- 2610023760112 -->\n",
       "<g class=\"node\" id=\"node2\"><title>2610023760112</title>\n",
       "<polygon fill=\"none\" points=\"13,-581.5 13,-627.5 203,-627.5 203,-581.5 13,-581.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-600.8\">Dense</text>\n",
       "<polyline fill=\"none\" points=\"64,-581.5 64,-627.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-612.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"64,-604.5 120,-604.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-589.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"120,-581.5 120,-627.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-612.3\">(None, 500)</text>\n",
       "<polyline fill=\"none\" points=\"120,-604.5 203,-604.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-589.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2610023760336&#45;&gt;2610023760112 -->\n",
       "<g class=\"edge\" id=\"edge1\"><title>2610023760336-&gt;2610023760112</title>\n",
       "<path d=\"M108,-664.366C108,-656.152 108,-646.658 108,-637.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-637.607 108,-627.607 104.5,-637.607 111.5,-637.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2607322750760 -->\n",
       "<g class=\"node\" id=\"node3\"><title>2607322750760</title>\n",
       "<polygon fill=\"none\" points=\"6.5,-498.5 6.5,-544.5 209.5,-544.5 209.5,-498.5 6.5,-498.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-517.8\">Dropout</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-498.5 70.5,-544.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-529.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-521.5 126.5,-521.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-506.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-498.5 126.5,-544.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-529.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-521.5 209.5,-521.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-506.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2610023760112&#45;&gt;2607322750760 -->\n",
       "<g class=\"edge\" id=\"edge2\"><title>2610023760112-&gt;2607322750760</title>\n",
       "<path d=\"M108,-581.366C108,-573.152 108,-563.658 108,-554.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-554.607 108,-544.607 104.5,-554.607 111.5,-554.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2607322750816 -->\n",
       "<g class=\"node\" id=\"node4\"><title>2607322750816</title>\n",
       "<polygon fill=\"none\" points=\"13,-415.5 13,-461.5 203,-461.5 203,-415.5 13,-415.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-434.8\">Dense</text>\n",
       "<polyline fill=\"none\" points=\"64,-415.5 64,-461.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-446.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"64,-438.5 120,-438.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-423.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"120,-415.5 120,-461.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-446.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"120,-438.5 203,-438.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-423.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2607322750760&#45;&gt;2607322750816 -->\n",
       "<g class=\"edge\" id=\"edge3\"><title>2607322750760-&gt;2607322750816</title>\n",
       "<path d=\"M108,-498.366C108,-490.152 108,-480.658 108,-471.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-471.607 108,-461.607 104.5,-471.607 111.5,-471.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2612155375456 -->\n",
       "<g class=\"node\" id=\"node5\"><title>2612155375456</title>\n",
       "<polygon fill=\"none\" points=\"6.5,-332.5 6.5,-378.5 209.5,-378.5 209.5,-332.5 6.5,-332.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-351.8\">Dropout</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-332.5 70.5,-378.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-363.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-355.5 126.5,-355.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-340.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-332.5 126.5,-378.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-363.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-355.5 209.5,-355.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-340.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2607322750816&#45;&gt;2612155375456 -->\n",
       "<g class=\"edge\" id=\"edge4\"><title>2607322750816-&gt;2612155375456</title>\n",
       "<path d=\"M108,-415.366C108,-407.152 108,-397.658 108,-388.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-388.607 108,-378.607 104.5,-388.607 111.5,-388.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2612155372992 -->\n",
       "<g class=\"node\" id=\"node6\"><title>2612155372992</title>\n",
       "<polygon fill=\"none\" points=\"13,-249.5 13,-295.5 203,-295.5 203,-249.5 13,-249.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-268.8\">Dense</text>\n",
       "<polyline fill=\"none\" points=\"64,-249.5 64,-295.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-280.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"64,-272.5 120,-272.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-257.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"120,-249.5 120,-295.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-280.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"120,-272.5 203,-272.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-257.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2612155375456&#45;&gt;2612155372992 -->\n",
       "<g class=\"edge\" id=\"edge5\"><title>2612155375456-&gt;2612155372992</title>\n",
       "<path d=\"M108,-332.366C108,-324.152 108,-314.658 108,-305.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-305.607 108,-295.607 104.5,-305.607 111.5,-305.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2612155733440 -->\n",
       "<g class=\"node\" id=\"node7\"><title>2612155733440</title>\n",
       "<polygon fill=\"none\" points=\"6.5,-166.5 6.5,-212.5 209.5,-212.5 209.5,-166.5 6.5,-166.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-185.8\">Dropout</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-166.5 70.5,-212.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-197.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"70.5,-189.5 126.5,-189.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"98.5\" y=\"-174.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-166.5 126.5,-212.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-197.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"126.5,-189.5 209.5,-189.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"168\" y=\"-174.3\">(None, 512)</text>\n",
       "</g>\n",
       "<!-- 2612155372992&#45;&gt;2612155733440 -->\n",
       "<g class=\"edge\" id=\"edge6\"><title>2612155372992-&gt;2612155733440</title>\n",
       "<path d=\"M108,-249.366C108,-241.152 108,-231.658 108,-222.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-222.607 108,-212.607 104.5,-222.607 111.5,-222.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2612155736016 -->\n",
       "<g class=\"node\" id=\"node8\"><title>2612155736016</title>\n",
       "<polygon fill=\"none\" points=\"13,-83.5 13,-129.5 203,-129.5 203,-83.5 13,-83.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"38.5\" y=\"-102.8\">Dense</text>\n",
       "<polyline fill=\"none\" points=\"64,-83.5 64,-129.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-114.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"64,-106.5 120,-106.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"92\" y=\"-91.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"120,-83.5 120,-129.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-114.3\">(None, 512)</text>\n",
       "<polyline fill=\"none\" points=\"120,-106.5 203,-106.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"161.5\" y=\"-91.3\">(None, 2)</text>\n",
       "</g>\n",
       "<!-- 2612155733440&#45;&gt;2612155736016 -->\n",
       "<g class=\"edge\" id=\"edge7\"><title>2612155733440-&gt;2612155736016</title>\n",
       "<path d=\"M108,-166.366C108,-158.152 108,-148.658 108,-139.725\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-139.607 108,-129.607 104.5,-139.607 111.5,-139.607\" stroke=\"black\"/>\n",
       "</g>\n",
       "<!-- 2612156093664 -->\n",
       "<g class=\"node\" id=\"node9\"><title>2612156093664</title>\n",
       "<polygon fill=\"none\" points=\"8,-0.5 8,-46.5 208,-46.5 208,-0.5 8,-0.5\" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"45\" y=\"-19.8\">Activation</text>\n",
       "<polyline fill=\"none\" points=\"82,-0.5 82,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"110\" y=\"-31.3\">input:</text>\n",
       "<polyline fill=\"none\" points=\"82,-23.5 138,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"110\" y=\"-8.3\">output:</text>\n",
       "<polyline fill=\"none\" points=\"138,-0.5 138,-46.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"173\" y=\"-31.3\">(None, 2)</text>\n",
       "<polyline fill=\"none\" points=\"138,-23.5 208,-23.5 \" stroke=\"black\"/>\n",
       "<text font-family=\"Times New Roman,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"173\" y=\"-8.3\">(None, 2)</text>\n",
       "</g>\n",
       "<!-- 2612155736016&#45;&gt;2612156093664 -->\n",
       "<g class=\"edge\" id=\"edge8\"><title>2612155736016-&gt;2612156093664</title>\n",
       "<path d=\"M108,-83.3664C108,-75.1516 108,-65.6579 108,-56.7252\" fill=\"none\" stroke=\"black\"/>\n",
       "<polygon fill=\"black\" points=\"111.5,-56.6068 108,-46.6068 104.5,-56.6069 111.5,-56.6068\" stroke=\"black\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import SVG\n",
    "from keras.utils.vis_utils import model_to_dot\n",
    "\n",
    "SVG(model_to_dot(w2v_dnn, show_shapes=True, show_layer_names=False, \n",
    "                 rankdir='TB').create(prog='dot', format='svg'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Training, Prediction and Performance Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 31500 samples, validate on 3500 samples\n",
      "Epoch 1/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.3097 - acc: 0.8720 - val_loss: 0.3159 - val_acc: 0.8646\n",
      "Epoch 2/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.2869 - acc: 0.8819 - val_loss: 0.3024 - val_acc: 0.8743\n",
      "Epoch 3/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.2778 - acc: 0.8857 - val_loss: 0.3012 - val_acc: 0.8763\n",
      "Epoch 4/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.2708 - acc: 0.8901 - val_loss: 0.3041 - val_acc: 0.8734\n",
      "Epoch 5/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.2612 - acc: 0.8920 - val_loss: 0.3023 - val_acc: 0.8763\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x260469dd470>"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch_size = 100\n",
    "w2v_dnn.fit(avg_wv_train_features, y_train, epochs=5, batch_size=batch_size, \n",
    "            shuffle=True, validation_split=0.1, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14656/15000 [============================>.] - ETA: 0s"
     ]
    }
   ],
   "source": [
    "y_pred = w2v_dnn.predict_classes(avg_wv_test_features)\n",
    "predictions = le.inverse_transform(y_pred) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.88\n",
      "Precision: 0.88\n",
      "Recall: 0.88\n",
      "F1 Score: 0.88\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.88      0.89      0.88      7510\n",
      "   negative       0.89      0.87      0.88      7490\n",
      "\n",
      "avg / total       0.88      0.88      0.88     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6711      799\n",
      "        negative        952     6538\n"
     ]
    }
   ],
   "source": [
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
    "                                      classes=['positive', 'negative'])  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "glove_dnn = construct_deepnn_architecture(num_input_features=300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 31500 samples, validate on 3500 samples\n",
      "Epoch 1/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.4171 - acc: 0.8096 - val_loss: 0.3686 - val_acc: 0.8397\n",
      "Epoch 2/5\n",
      "31500/31500 [==============================] - 10s - loss: 0.3734 - acc: 0.8364 - val_loss: 0.4048 - val_acc: 0.8129\n",
      "Epoch 3/5\n",
      "31500/31500 [==============================] - 10s - loss: 0.3657 - acc: 0.8395 - val_loss: 0.3933 - val_acc: 0.8326\n",
      "Epoch 4/5\n",
      "31500/31500 [==============================] - 10s - loss: 0.3551 - acc: 0.8450 - val_loss: 0.3555 - val_acc: 0.8403\n",
      "Epoch 5/5\n",
      "31500/31500 [==============================] - 11s - loss: 0.3523 - acc: 0.8450 - val_loss: 0.3544 - val_acc: 0.8437\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x26033f1fa58>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch_size = 100\n",
    "glove_dnn.fit(train_glove_features, y_train, epochs=5, batch_size=batch_size, \n",
    "              shuffle=True, validation_split=0.1, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14816/15000 [============================>.] - ETA: 0s"
     ]
    }
   ],
   "source": [
    "y_pred = glove_dnn.predict_classes(test_glove_features)\n",
    "predictions = le.inverse_transform(y_pred) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Performance metrics:\n",
      "------------------------------\n",
      "Accuracy: 0.85\n",
      "Precision: 0.85\n",
      "Recall: 0.85\n",
      "F1 Score: 0.85\n",
      "\n",
      "Model Classification report:\n",
      "------------------------------\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "   positive       0.85      0.85      0.85      7510\n",
      "   negative       0.85      0.85      0.85      7490\n",
      "\n",
      "avg / total       0.85      0.85      0.85     15000\n",
      "\n",
      "\n",
      "Prediction Confusion Matrix:\n",
      "------------------------------\n",
      "                 Predicted:         \n",
      "                   positive negative\n",
      "Actual: positive       6370     1140\n",
      "        negative       1154     6336\n"
     ]
    }
   ],
   "source": [
    "meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, \n",
    "                                      classes=['positive', 'negative'])  "
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}