{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "xVJt8aHpqogQ", "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 279 }, "colab_type": "code", "id": "ddrQBmF1rUHA", "outputId": "573a2553-6b5f-4fb1-fba0-902b9e2b5f1e" }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# code for loading the format for the notebook\n", "import os\n", "\n", "# path : store the current path to convert back to it later\n", "path = os.getcwd()\n", "os.chdir(os.path.join('..', '..', 'notebook_format'))\n", "\n", "from formats import load_style\n", "load_style(plot_style=False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 254 }, "colab_type": "code", "id": "R_ZoURXsqogT", "outputId": "fbe85c03-9c5e-41e8-bcc1-a50d6776b685" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Ethen 2019-12-09 11:59:08 \n", "\n", "CPython 3.6.4\n", "IPython 7.9.0\n", "\n", "numpy 1.16.5\n", "pandas 0.25.0\n", "sklearn 0.21.2\n", "keras 2.2.2\n" ] } ], "source": [ "os.chdir(path)\n", "\n", "# 1. magic for inline plot\n", "# 2. magic to print version\n", "# 3. magic so that the notebook will reload external python modules\n", "# 4. magic to enable retina (high resolution) plots\n", "# https://gist.github.com/minrk/3301035\n", "%matplotlib inline\n", "%load_ext watermark\n", "%load_ext autoreload\n", "%autoreload 2\n", "%config InlineBackend.figure_format='retina'\n", "\n", "import os\n", "import time\n", "import numpy as np\n", "import pandas as pd\n", "from typing import List, Tuple, Dict\n", "from sklearn.model_selection import train_test_split\n", "from keras import layers\n", "from keras.models import Model\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.utils.np_utils import to_categorical\n", "from keras.preprocessing.sequence import pad_sequences\n", "\n", "# prevent scientific notations\n", "pd.set_option('display.float_format', lambda x: '%.3f' % x)\n", "\n", "%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,keras" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "CmvuANtZqogY" }, "source": [ "# Leveraging Pre-trained Word Embedding for Text Classification" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "PfMtWwMtqogZ" }, "source": [ "There are two main ways to obtain word embeddings:\n", "\n", "- Learn it from scratch: We specify a neural network architecture and learn the word embeddings jointly with the main task at our hand (e.g. sentiment classification). i.e. we would start off with some random word embeddings, and it would update itself along with the word embeddings.\n", "- Transfer Learning: The whole idea behind transfer learning is to avoid reinventing the wheel as much as possible. It gives us the capability to transfer knowledge that was gained/learned in some other task and use it to improve the learning of another related task. 
In practice, one way to do this is, for the embedding part of the neural network architecture, to load embeddings that were trained on a different machine learning task than the one we are trying to solve, and use them to bootstrap the process.\n", "\nOne area where transfer learning shines is when we have little training data available, and using our data alone might not be enough to learn an appropriate task-specific embedding/features for our vocabulary. In this case, leveraging a word embedding that captures generic aspects of the language can prove to be beneficial from both a performance and time perspective (i.e. we won't have to spend hours/days training a model from scratch to achieve a similar performance). Keep in mind that, as with all machine learning applications, everything is still all about trial and error. What makes an embedding good depends heavily on the task at hand: the word embedding for a movie review sentiment classification model may look very different from one for a legal document classification model, as the semantics of the corpus vary between these two tasks." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "suQh5v7Bqoga" }, "source": [ "## Data Preparation" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "kMJCwvDnqoga" }, "source": [ "We'll use the movie review sentiment analysis dataset from [Kaggle](https://www.kaggle.com/c/word2vec-nlp-tutorial/overview) for this example. It's a binary classification problem with AUC as the ultimate evaluation metric. The next few code chunks perform the usual text preprocessing, build up the word vocabulary, and perform a train/test split." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", "id": "YINhhEhOJzjX" }, "outputs": [], "source": [ "data_dir = 'data'\n", "submission_dir = 'submission'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 215 }, "colab_type": "code", "id": "PJXFFj-jqogb", "outputId": "9e6cd63f-7852-4185-98d4-f72533f2a1ed" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(25000, 3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentimentreview
05814_81With all this stuff going down at the moment w...
12381_91\\The Classic War of the Worlds\\\" by Timothy Hi...
27759_30The film starts with a manager (Nicholas Bell)...
33630_40It must be assumed that those who praised this...
49495_81Superbly trashy and wondrously unpretentious 8...
\n", "
" ], "text/plain": [ " id sentiment review\n", "0 5814_8 1 With all this stuff going down at the moment w...\n", "1 2381_9 1 \\The Classic War of the Worlds\\\" by Timothy Hi...\n", "2 7759_3 0 The film starts with a manager (Nicholas Bell)...\n", "3 3630_4 0 It must be assumed that those who praised this...\n", "4 9495_8 1 Superbly trashy and wondrously unpretentious 8..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'labeledTrainData.tsv')\n", "df = pd.read_csv(input_path, delimiter='\\t')\n", "print(df.shape)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 55 }, "colab_type": "code", "id": "gr5Kmwnaqoge", "outputId": "13836e57-f9c7-42ca-deff-93f2616e80af" }, "outputs": [ { "data": { "text/plain": [ "\"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.

Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.

The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.

Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.

Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_text = df['review'].iloc[0]\n", "raw_text" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", "id": "Fm60Nd2rqogh" }, "outputs": [], "source": [ "import re\n", "\n", "def clean_str(string: str) -> str:\n", " string = re.sub(r\"\\\\\", \"\", string) \n", " string = re.sub(r\"\\'\", \"\", string) \n", " string = re.sub(r\"\\\"\", \"\", string) \n", " return string.strip().lower()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", "id": "uGJqdqO5qogj" }, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "def clean_text(df: pd.DataFrame,\n", " text_col: str,\n", " label_col: str) -> Tuple[List[str], List[int]]:\n", " texts = []\n", " labels = []\n", " for raw_text, label in zip(df[text_col], df[label_col]): \n", " text = BeautifulSoup(raw_text).get_text()\n", " cleaned_text = clean_str(text)\n", " texts.append(cleaned_text)\n", " labels.append(label)\n", "\n", " return texts, labels" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 72 }, "colab_type": "code", "id": "PLZajMt7qogm", "outputId": "52639b8d-47e1-4648-be0d-33de23cbc0c3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sample text: with all this stuff going down at the moment with mj ive started listening to his music, watching the odd documentary here and there, watched the wiz and watched moonwalker again. maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. some of it has subtle messages about mjs feeling towards the press and also the obvious message of drugs are bad mkay.visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring. some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him.the actual feature film bit when it finally starts is only on for 20 minutes or so excluding the smooth criminal sequence and joe pesci is convincing as a psychopathic all powerful drug lord. why he wants mj dead so bad is beyond me. because mj overheard his plans? nah, joe pescis character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates mjs music.lots of cool things in this like mj turning into a car and a robot and the whole speed demon sequence. 
also, the director must have had the patience of a saint when it came to filming the kiddy bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.bottom line, this movie is for people who like mj on one level or another (which i think is most people). if not, then stay away. it does try and give off a wholesome message and ironically mjs bestest buddy in this movie is a girl! michael jackson is truly one of the most talented people ever to grace this planet but is he guilty? well, with all the attention ive gave this subject....hmmm well i dont know because people can be different behind closed doors, i know this for a fact. he is either an extremely nice but stupid guy or one of the most sickest liars. i hope he is not the latter.\n", "corresponding label: 1\n" ] } ], "source": [ "text_col = 'review'\n", "label_col = 'sentiment'\n", "texts, labels = clean_text(df, text_col, label_col)\n", "print('sample text: ', texts[0])\n", "print('corresponding label:', labels[0])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 69 }, "colab_type": "code", "id": "1be5IZyyJW68", "outputId": "1660a109-3ef2-4601-b724-23559b1b221f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "labels shape: (25000, 2)\n", "train size: 20000\n", "validation size: 5000\n" ] } ], "source": [ "random_state = 1234\n", "val_split = 0.2\n", "\n", "labels = to_categorical(labels)\n", "texts_train, texts_val, y_train, y_val = train_test_split(\n", " texts, labels,\n", " test_size=val_split,\n", " random_state=random_state)\n", "\n", "print('labels shape:', labels.shape)\n", "print('train size: ', len(texts_train))\n", "print('validation size: ', len(texts_val))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "11xqximiJXxI", "outputId": "854152f5-e0ad-4e08-c3cd-4fcbd99e6a54" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 74207 unique tokens.\n" ] } ], "source": [ "max_num_words = 20000\n", "\n", "tokenizer = Tokenizer(num_words=max_num_words, oov_token='')\n", "tokenizer.fit_on_texts(texts_train)\n", "print('Found %s unique tokens.' % len(tokenizer.word_index))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "cAQq2V1rJZXP", "outputId": "0f68cb74-7342-404e-d15a-9262e92b5714" }, "outputs": [ { "data": { "text/plain": [ "[50, 1, 863, 2, 17]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_sequence_len = 1000\n", "\n", "sequences_train = tokenizer.texts_to_sequences(texts_train)\n", "x_train = pad_sequences(sequences_train, maxlen=max_sequence_len)\n", "\n", "sequences_val = tokenizer.texts_to_sequences(texts_val)\n", "x_val = pad_sequences(sequences_val, maxlen=max_sequence_len)\n", "\n", "sequences_train[0][:5]" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "hMVx5_mkqogv" }, "source": [ "## Glove" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "b4b4aBL6qogw" }, "source": [ "There are many different pretrained word embeddings online. The one we'll be using is from [Glove](https://nlp.stanford.edu/projects/glove/). 
Others include, but are not limited to, [FastText](https://fasttext.cc/docs/en/crawl-vectors.html) and [bpemb](https://github.com/bheinzerling/bpemb).\n", "\nIf we look at the project's wiki page, we can find many different pretrained embeddings available for us to experiment with.\n", "\n", "" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": {}, "colab_type": "code", "id": "rRq_oIOoqogy" }, "outputs": [], "source": [ "import requests\n", "from tqdm import tqdm\n", "\n", "def download_glove(embedding_type: str='glove.6B.zip'):\n", " \"\"\"\n", " download GloVe word vector representations, this step may take a while\n", " \n", " Parameters\n", " ----------\n", " embedding_type : str, default 'glove.6B.zip'\n", " Specifying different glove embeddings to download if not already there.\n", " {'glove.6B.zip', 'glove.42B.300d.zip', 'glove.840B.300d.zip', 'glove.twitter.27B.zip'}\n", " Be wary of the size. e.g. 'glove.6B.zip' is 822 MB zipped, 2GB unzipped\n", " \"\"\"\n", "\n", " base_url = 'http://nlp.stanford.edu/data/'\n", " if not os.path.isfile(embedding_type):\n", " url = base_url + embedding_type\n", "\n", " # the following section is a pretty generic http get request for\n", " # saving large files, provides progress bars for checking progress\n", " response = requests.get(url, stream=True)\n", " response.raise_for_status()\n", "\n", " content_len = response.headers.get('Content-Length')\n", " total = int(content_len) if content_len is not None else 0\n", "\n", " with tqdm(unit='B', total=total) as pbar, open(embedding_type, 'wb') as f:\n", " for chunk in response.iter_content(chunk_size=1024):\n", " if chunk:\n", " pbar.update(len(chunk))\n", " f.write(chunk)\n", "\n", " if response.headers.get('Content-Type') == 'application/zip':\n", " from zipfile import ZipFile\n", " with ZipFile(embedding_type, 'r') as f:\n", " f.extractall(embedding_type.strip('.zip'))\n", "\n", "\n", "download_glove()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "AvnHzVFWqog1" }, "source": [ "The way we'll leverage the pretrained embedding is to first read it in as a dictionary lookup, where the key is the word and the value is the corresponding word embedding. Then for each token in our vocabulary, we'll look up this dictionary to see if there's a pretrained embedding available: if there is, we'll use the pretrained embedding; if there isn't, we'll leave the embedding for this word in its original randomly initialized form.\n", "\nThe format for this particular pretrained embedding is that every line contains space-delimited values, where the first token is the word and the rest are its corresponding embedding values. e.g. 
the first line from the line looks like:\n", "\n", "```\n", "the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062\n", "```" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": {}, "colab_type": "code", "id": "LVqk-JdCqog1" }, "outputs": [], "source": [ "def get_embedding_lookup(embedding_path) -> Dict[str, np.ndarray]:\n", " embedding_lookup = {}\n", " with open(embedding_path) as f:\n", " for line in f:\n", " values = line.split()\n", " word = values[0]\n", " coef = np.array(values[1:], dtype=np.float32)\n", " embedding_lookup[word] = coef\n", "\n", " return embedding_lookup\n", "\n", "\n", "def get_pretrained_embedding(embedding_path: str,\n", " index2word: Dict[int, str],\n", " max_features: int) -> np.ndarray:\n", " embedding_lookup = get_embedding_lookup(embedding_path)\n", "\n", " pretrained_embedding = np.stack(list(embedding_lookup.values()))\n", " embedding_dim = pretrained_embedding.shape[1]\n", " embeddings = np.random.normal(pretrained_embedding.mean(),\n", " pretrained_embedding.std(),\n", " (max_features, embedding_dim)).astype(np.float32)\n", " # we track how many tokens in our vocabulary exists in the pre-trained embedding,\n", " # i.e. how many tokens has a pre-trained embedding from this particular file\n", " n_found = 0\n", " \n", " # the loop starts from 1 due to keras' Tokenizer reserves 0 for padding index\n", " for i in range(1, max_features):\n", " word = index2word[i]\n", " embedding_vector = embedding_lookup.get(word)\n", " if embedding_vector is not None:\n", " embeddings[i] = embedding_vector\n", " n_found += 1\n", "\n", " print('number of words found:', n_found)\n", " return embeddings" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, "colab_type": "code", "id": "3OY9akajqog4", "outputId": "f904a1ab-805e-4942-bf7a-a9fd5cfc0f8a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number of words found: 19654\n" ] }, { "data": { "text/plain": [ "(20001, 100)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "glove_path = os.path.join('glove.6B', 'glove.6B.100d.txt')\n", "max_features = max_num_words + 1\n", "\n", "pretrained_embedding = get_pretrained_embedding(glove_path, tokenizer.index_word, max_features)\n", "pretrained_embedding.shape" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "hvs-IEcXqog6" }, "source": [ "## Model" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "4x7VI20X1Cm8" }, "source": [ "To train our text classifier, we specify a 1D convolutional network. 
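The next paragraph describes freezing the pre-trained embedding during the initial rounds of training and, optionally, un-freezing it later to fine-tune the whole network. Since that un-freezing step itself is not shown in the code below, here is a minimal sketch of what it could look like (assuming a compiled model whose embedding layer is named 'embedding', e.g. the `model1` we build shortly):\n", "\n", "```python\n", "from keras import optimizers\n", "\n", "# un-freeze the pre-trained embedding once the randomly initialized layers have settled\n", "model1.get_layer('embedding').trainable = True\n", "\n", "# re-compile so the trainable change takes effect, using a smaller learning rate\n", "# to avoid large gradient updates disrupting the pre-trained weights\n", "model1.compile(loss='categorical_crossentropy',\n", "               optimizer=optimizers.Adam(lr=1e-4),\n", "               metrics=['acc'])\n", "\n", "# continue training end to end for a few more epochs\n", "model1.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=128, epochs=2)\n", "```\n", "\n", "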
Our embedding layer can either be initialized randomly or loaded from a pre-trained embedding. Note that for the pre-trained embedding case, apart from loading the weights, we also \"freeze\" the embedding layer, i.e. we set its trainable attribute to False. This idea is often times used in transfer learning, where when parts of a model are pre-trained (in our case, only our Embedding layer), and parts of it are randomly initialized, the pre-trained part should ideally not be trained together with the randomly initialized part. The rationale behind it is that a large gradient update triggered by the randomly initialized layer would become very disruptive to those pre-trained weights.\n", "\n", "Once we train the randomly initialized weights for a few iterations, we can then go about un-freezing the layers that were loaded with pre-trained weights, and do an update on the weight for the entire thing. The [keras documentation](https://keras.io/applications/#fine-tune-inceptionv3-on-a-new-set-of-classes) also provides an example of how to do this, although the example is for image models, the same idea can also be applied here, and can be something that's worth experimenting." ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab": {}, "colab_type": "code", "id": "jp_qfjtkxSDZ" }, "outputs": [], "source": [ "def simple_text_cnn(max_sequence_len: int,\n", " max_features: int,\n", " num_classes: int,\n", " optimizer: str='adam',\n", " metrics: List[str]=['acc'],\n", " pretrained_embedding: np.ndarray=None) -> Model:\n", "\n", " sequence_input = layers.Input(shape=(max_sequence_len,), dtype='int32')\n", " if pretrained_embedding is None:\n", " embedded_sequences = layers.Embedding(max_features, 100,\n", " name='embedding')(sequence_input)\n", " else:\n", " embedded_sequences = layers.Embedding(max_features, pretrained_embedding.shape[1],\n", " weights=[pretrained_embedding],\n", " name='embedding',\n", " trainable=False)(sequence_input)\n", "\n", " conv1 = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)\n", " pool1 = layers.MaxPooling1D(5)(conv1)\n", " conv2 = layers.Conv1D(128, 5, activation='relu')(pool1)\n", " pool2 = layers.MaxPooling1D(5)(conv2)\n", " conv3 = layers.Conv1D(128, 5, activation='relu')(pool2)\n", " pool3 = layers.MaxPooling1D(35)(conv3)\n", " flatten = layers.Flatten()(pool3)\n", " dense = layers.Dense(128, activation='relu')(flatten)\n", " preds = layers.Dense(num_classes, activation='softmax')(dense)\n", "\n", " model = Model(sequence_input, preds)\n", " model.compile(loss='categorical_crossentropy',\n", " optimizer=optimizer,\n", " metrics=metrics)\n", " return model" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "tnPIKW75qog7" }, "source": [ "### Model with Pretrained Embedding" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 970 }, "colab_type": "code", "id": "oWBm-9wLqog-", "outputId": "20d73b5a-2b4f-4bbe-fbb2-66bd38f31523" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. 
Please use tf.compat.v1.placeholder instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:203: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:207: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:216: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:223: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4267: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3576: The name tf.log is deprecated. 
Please use tf.math.log instead.\n", "\n", "Model: \"model_1\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "input_1 (InputLayer) (None, 1000) 0 \n", "_________________________________________________________________\n", "embedding (Embedding) (None, 1000, 100) 2000100 \n", "_________________________________________________________________\n", "conv1d_1 (Conv1D) (None, 996, 128) 64128 \n", "_________________________________________________________________\n", "max_pooling1d_1 (MaxPooling1 (None, 199, 128) 0 \n", "_________________________________________________________________\n", "conv1d_2 (Conv1D) (None, 195, 128) 82048 \n", "_________________________________________________________________\n", "max_pooling1d_2 (MaxPooling1 (None, 39, 128) 0 \n", "_________________________________________________________________\n", "conv1d_3 (Conv1D) (None, 35, 128) 82048 \n", "_________________________________________________________________\n", "max_pooling1d_3 (MaxPooling1 (None, 1, 128) 0 \n", "_________________________________________________________________\n", "flatten_1 (Flatten) (None, 128) 0 \n", "_________________________________________________________________\n", "dense_1 (Dense) (None, 128) 16512 \n", "_________________________________________________________________\n", "dense_2 (Dense) (None, 2) 258 \n", "=================================================================\n", "Total params: 2,245,094\n", "Trainable params: 244,994\n", "Non-trainable params: 2,000,100\n", "_________________________________________________________________\n" ] } ], "source": [ "num_classes = 2\n", "model1 = simple_text_cnn(max_sequence_len, max_features, num_classes,\n", " pretrained_embedding=pretrained_embedding)\n", "model1.summary()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "7Jp2cS-sOe_d" }, "source": [ "We can confirm whether our embedding layer is trainable by looping through each layer and checking the trainable attribute." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 378 }, "colab_type": "code", "id": "U1nmAdZyqohA", "outputId": "44dee935-7ff2-4f91-f78e-b08a6bfb3aea" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
layertrainablen_params
0input_1False0
1embeddingFalse2000100
2conv1d_1True64128
3max_pooling1d_1True0
4conv1d_2True82048
5max_pooling1d_2True0
6conv1d_3True82048
7max_pooling1d_3True0
8flatten_1True0
9dense_1True16512
10dense_2True258
\n", "
" ], "text/plain": [ " layer trainable n_params\n", "0 input_1 False 0\n", "1 embedding False 2000100\n", "2 conv1d_1 True 64128\n", "3 max_pooling1d_1 True 0\n", "4 conv1d_2 True 82048\n", "5 max_pooling1d_2 True 0\n", "6 conv1d_3 True 82048\n", "7 max_pooling1d_3 True 0\n", "8 flatten_1 True 0\n", "9 dense_1 True 16512\n", "10 dense_2 True 258" ] }, "execution_count": 18, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df_model_layers = pd.DataFrame(\n", " [(layer.name, layer.trainable, layer.count_params()) for layer in model1.layers],\n", " columns=['layer', 'trainable', 'n_params']\n", ")\n", "df_model_layers" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 469 }, "colab_type": "code", "id": "LiKOKRG8qohF", "outputId": "6479f341-278b-427f-e803-949c3b1e1951" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1020: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.\n", "\n", "Train on 20000 samples, validate on 5000 samples\n", "Epoch 1/8\n", "20000/20000 [==============================] - 12s 604us/step - loss: 0.5854 - acc: 0.6748 - val_loss: 0.4772 - val_acc: 0.7808\n", "Epoch 2/8\n", "20000/20000 [==============================] - 8s 416us/step - loss: 0.4001 - acc: 0.8186 - val_loss: 0.3766 - val_acc: 0.8352\n", "Epoch 3/8\n", "20000/20000 [==============================] - 8s 414us/step - loss: 0.3428 - acc: 0.8507 - val_loss: 0.4276 - val_acc: 0.7966\n", "Epoch 4/8\n", "20000/20000 [==============================] - 8s 415us/step - loss: 0.2790 - acc: 0.8842 - val_loss: 0.3433 - val_acc: 0.8594\n", "Epoch 5/8\n", "20000/20000 [==============================] - 8s 415us/step - loss: 0.2469 - acc: 0.8987 - val_loss: 0.4015 - val_acc: 0.8310\n", "Epoch 6/8\n", "20000/20000 [==============================] - 8s 420us/step - loss: 0.1782 - acc: 0.9289 - val_loss: 0.4670 - val_acc: 0.8296\n", "Epoch 7/8\n", "20000/20000 [==============================] - 8s 419us/step - loss: 0.1017 - acc: 0.9643 - val_loss: 0.5965 - val_acc: 0.8146\n", "Epoch 8/8\n", "20000/20000 [==============================] - 8s 418us/step - loss: 0.0680 - acc: 0.9758 - val_loss: 0.6876 - val_acc: 0.8332\n" ] }, { "data": { "text/plain": [ "70.9072277545929" ] }, "execution_count": 19, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "# time : 70\n", "# test performance : auc 0.93212\n", "start = time.time()\n", "history1 = model1.fit(x_train, y_train,\n", " validation_data=(x_val, y_val),\n", " batch_size=128,\n", " epochs=8)\n", "end = time.time()\n", "elapse1 = end - start\n", "elapse1" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "gJ1kLS0kqohI" }, "source": [ "### Model without Pretrained Embedding" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", "height": 535 }, "colab_type": "code", "id": "Gqv8kd_OqohJ", "outputId": "2169b744-6177-4a0e-80a8-194d41bbfe96" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model_2\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "input_2 (InputLayer) (None, 1000) 0 \n", "_________________________________________________________________\n", "embedding (Embedding) (None, 1000, 100) 2000100 \n", "_________________________________________________________________\n", "conv1d_4 (Conv1D) (None, 996, 128) 64128 \n", "_________________________________________________________________\n", "max_pooling1d_4 (MaxPooling1 (None, 199, 128) 0 \n", "_________________________________________________________________\n", "conv1d_5 (Conv1D) (None, 195, 128) 82048 \n", "_________________________________________________________________\n", "max_pooling1d_5 (MaxPooling1 (None, 39, 128) 0 \n", "_________________________________________________________________\n", "conv1d_6 (Conv1D) (None, 35, 128) 82048 \n", "_________________________________________________________________\n", "max_pooling1d_6 (MaxPooling1 (None, 1, 128) 0 \n", "_________________________________________________________________\n", "flatten_2 (Flatten) (None, 128) 0 \n", "_________________________________________________________________\n", "dense_3 (Dense) (None, 128) 16512 \n", "_________________________________________________________________\n", "dense_4 (Dense) (None, 2) 258 \n", "=================================================================\n", "Total params: 2,245,094\n", "Trainable params: 2,245,094\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "num_classes = 2\n", "model2 = simple_text_cnn(max_sequence_len, max_features, num_classes)\n", "model2.summary()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 348 }, "colab_type": "code", "id": "S-R6GRK2qohM", "outputId": "77b6284a-0621-4263-d99f-6d3fa9d21cad" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 20000 samples, validate on 5000 samples\n", "Epoch 1/8\n", "20000/20000 [==============================] - 11s 570us/step - loss: 0.5010 - acc: 0.7065 - val_loss: 0.3016 - val_acc: 0.8730\n", "Epoch 2/8\n", "20000/20000 [==============================] - 11s 542us/step - loss: 0.2024 - acc: 0.9243 - val_loss: 0.2816 - val_acc: 0.8824\n", "Epoch 3/8\n", "20000/20000 [==============================] - 11s 538us/step - loss: 0.0806 - acc: 0.9734 - val_loss: 0.3552 - val_acc: 0.8812\n", "Epoch 4/8\n", "20000/20000 [==============================] - 11s 535us/step - loss: 0.0272 - acc: 0.9917 - val_loss: 0.4671 - val_acc: 0.8836\n", "Epoch 5/8\n", "20000/20000 [==============================] - 11s 543us/step - loss: 0.0088 - acc: 0.9973 - val_loss: 0.6534 - val_acc: 0.8788\n", "Epoch 6/8\n", "20000/20000 [==============================] - 11s 542us/step - loss: 0.0090 - acc: 0.9973 - val_loss: 0.7522 - val_acc: 0.8740\n", "Epoch 7/8\n", "20000/20000 [==============================] - 11s 542us/step - loss: 0.0104 - acc: 0.9967 - val_loss: 1.0453 - val_acc: 0.8480\n", "Epoch 8/8\n", "20000/20000 [==============================] - 11s 543us/step - loss: 0.0205 - acc: 0.9924 - val_loss: 0.6930 - val_acc: 0.8712\n" ] }, { "data": { 
"text/plain": [ "87.63022541999817" ] }, "execution_count": 21, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "# time : 86 secs\n", "# test performance : auc 0.92310\n", "start = time.time()\n", "history1 = model2.fit(x_train, y_train,\n", " validation_data=(x_val, y_val),\n", " batch_size=128,\n", " epochs=8)\n", "end = time.time()\n", "elapse1 = end - start\n", "elapse1" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "adM0Y3NDqohN" }, "source": [ "## Submission" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For the submission section, we read in and preprocess the test data provided by the competition, then generate the predicted probability column for both the model that uses pretrained embedding and one that doesn't to compare their performance." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 215 }, "colab_type": "code", "id": "rcIJtyixqohO", "outputId": "a075129d-8870-4184-93dc-460ef9f53df4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(25000, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idreview
012311_10Naturally in a film who's main themes are of m...
18348_2This movie is a disaster within a disaster fil...
25828_4All in all, this is a movie for kids. We saw i...
37186_2Afraid of the Dark left me with the impression...
412128_7A very accurate depiction of small time mob li...
\n", "
" ], "text/plain": [ " id review\n", "0 12311_10 Naturally in a film who's main themes are of m...\n", "1 8348_2 This movie is a disaster within a disaster fil...\n", "2 5828_4 All in all, this is a movie for kids. We saw i...\n", "3 7186_2 Afraid of the Dark left me with the impression...\n", "4 12128_7 A very accurate depiction of small time mob li..." ] }, "execution_count": 22, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "input_path = os.path.join(data_dir, 'word2vec-nlp-tutorial', 'testData.tsv')\n", "df_test = pd.read_csv(input_path, delimiter='\\t')\n", "print(df_test.shape)\n", "df_test.head()" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab": {}, "colab_type": "code", "id": "n4hAmBIoqohQ" }, "outputs": [], "source": [ "def clean_text_without_label(df: pd.DataFrame, text_col: str) -> List[str]:\n", " texts = []\n", " for raw_text in df[text_col]:\n", " text = BeautifulSoup(raw_text).get_text()\n", " cleaned_text = clean_str(text)\n", " texts.append(cleaned_text)\n", "\n", " return texts" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "Pb8Q282IqohS", "outputId": "254057f0-24df-4162-9608-6ac2abd7dc0f" }, "outputs": [ { "data": { "text/plain": [ "25000" ] }, "execution_count": 25, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "texts_test = clean_text_without_label(df_test, text_col)\n", "sequences_test = tokenizer.texts_to_sequences(texts_test)\n", "x_test = pad_sequences(sequences_test, maxlen=max_sequence_len)\n", "len(x_test)" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "colab": {}, "colab_type": "code", "id": "tHLsmac7qohU" }, "outputs": [], "source": [ "def create_submission(ids, predictions, ids_col, prediction_col, submission_path) -> pd.DataFrame:\n", " df_submission = pd.DataFrame({\n", " ids_col: ids,\n", " prediction_col: predictions\n", " }, columns=[ids_col, prediction_col])\n", "\n", " if submission_path is not None:\n", " # create the directory if need be, e.g. if the submission_path = submission/submission.csv\n", " # we'll create the submission directory first if it doesn't exist\n", " directory = os.path.split(submission_path)[0]\n", " if (directory != '' or directory != '.') and not os.path.isdir(directory):\n", " os.makedirs(directory, exist_ok=True)\n", "\n", " df_submission.to_csv(submission_path, index=False, header=True)\n", "\n", " return df_submission" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "colab_type": "code", "id": "d8uQtCksqohW", "outputId": "0c454223-bae6-42c8-9505-6e7d15f8e25c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "generating submission for: pretrained_embedding\n", "25000/25000 [==============================] - 6s 228us/step\n", "generating submission for: without_pretrained_embedding\n", "25000/25000 [==============================] - 6s 222us/step\n", "(25000, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
012311_101.000
18348_20.000
25828_40.005
37186_20.071
412128_71.000
\n", "
" ], "text/plain": [ " id sentiment\n", "0 12311_10 1.000\n", "1 8348_2 0.000\n", "2 5828_4 0.005\n", "3 7186_2 0.071\n", "4 12128_7 1.000" ] }, "execution_count": 27, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "ids_col = 'id'\n", "prediction_col = 'sentiment'\n", "ids = df_test[ids_col]\n", "\n", "models = {\n", " 'pretrained_embedding': model1,\n", " 'without_pretrained_embedding': model2\n", "}\n", "\n", "for model_name, model in models.items():\n", " print('generating submission for: ', model_name)\n", " submission_path = os.path.join(submission_dir, '{}_submission.csv'.format(model_name))\n", " predictions = model.predict(x_test, verbose=1)[:, 1]\n", " df_submission = create_submission(ids, predictions, ids_col, prediction_col, submission_path)\n", "\n", "# sanity check to make sure the size and the output of the submission makes sense\n", "print(df_submission.shape)\n", "df_submission.head()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "74AbmuLXN2N8" }, "source": [ "## Summary" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "9gxwkBfwNyxA" }, "source": [ "In this article, we took a look at how to leverage pre-trained word embeddings for our text classification task. There're also various Kaggle Kernels [here](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings) and [here](https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge) that experiments whether different pre-trained embeddings or even an ensemble of models each with a different pre-trained embedding on various text classification tasks to see if it gives us an edge. " ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "6YtMHPyhqohY" }, "source": [ "# Reference" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "_y1PumdxqohY" }, "source": [ "- [Blog: Text Classification, Part I - Convolutional Networks](https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/)\n", "- [Blog: Using pre-trained word embeddings in a Keras model](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)\n", "- [Jupyter Notebook - Deep Learning with Python - Using Word Embeddings](https://nbviewer.jupyter.org/github/fchollet/deep-learning-with-python-notebooks/blob/master/6.1-using-word-embeddings.ipynb)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "keras_pretrained_embedding.ipynb", "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "296px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": 
"var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }