{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.5.4"
    },
    "colab": {
      "name": "kl_Recurrent_Neural_Network.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/klajosw/python/blob/master/kl_Recurrent_Neural_Network.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mAXdHLsmCtC9",
        "colab_type": "text"
      },
      "source": [
        "#  Part 1:  Recurrent Neural Network "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SfdIvQcyCtC_",
        "colab_type": "text"
      },
      "source": [
        "###  Importing packages"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rgd8svPhCtDA",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import re\n",
        "import numpy as np\n",
        "import pandas as pd \n",
        "\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "from keras.preprocessing import sequence\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "from keras.preprocessing.sequence import pad_sequences\n",
        "from keras.models import Sequential\n",
        "from keras.layers import Dense, Embedding, LSTM\n",
        "from keras.datasets import imdb\n",
        "\n",
        "from keras.utils.np_utils import to_categorical\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "import ssl\n",
        "ssl._create_default_https_context = ssl._create_unverified_context"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sWmzIsFjCtDG",
        "colab_type": "text"
      },
      "source": [
        "### Preparing Dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TE1lgX_nCtDH",
        "colab_type": "code",
        "outputId": "43c7b68e-d7f5-4490-cde9-e99b57a133ea",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 163
        }
      },
      "source": [
        "max_features = 1000\n",
        "maxlen = 80  # cut texts after this number of words (among top max_features most common words)\n",
        "batch_size = 32\n",
        "\n",
        "print('Loading data...')\n",
        "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n",
        "print(len(x_train), 'train sequences')\n",
        "print(len(x_test), 'test sequences')\n",
        "\n",
        "print('Pad sequences (samples x time)')\n",
        "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n",
        "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n",
        "print('x_train shape:', x_train.shape)\n",
        "print('x_test shape:', x_test.shape)"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Loading data...\n",
            "Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz\n",
            "17465344/17464789 [==============================] - 1s 0us/step\n",
            "25000 train sequences\n",
            "25000 test sequences\n",
            "Pad sequences (samples x time)\n",
            "x_train shape: (25000, 80)\n",
            "x_test shape: (25000, 80)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RrX2lqw1CtDO",
        "colab_type": "text"
      },
      "source": [
        "### Visualize the data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GV296kh6CtDP",
        "colab_type": "code",
        "outputId": "5d97985c-35a6-4904-c024-a684c9b488da",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 92
        }
      },
      "source": [
        "INDEX_FROM=3   # word index offset\n",
        "\n",
        "word_to_id = imdb.get_word_index()\n",
        "word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}\n",
        "word_to_id[\"<PAD>\"] = 0\n",
        "word_to_id[\"<START>\"] = 1\n",
        "word_to_id[\"<UNK>\"] = 2\n",
        "\n",
        "id_to_word = {value:key for key,value in word_to_id.items()}\n",
        "print(' '.join(id_to_word[id] for id in x_train[0] ))"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json\n",
            "1646592/1641221 [==============================] - 1s 0us/step\n",
            "that played the <UNK> of <UNK> and paul they were just brilliant children are often left out of the <UNK> <UNK> i think because the stars that play them all <UNK> up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they have done don't you think the whole story was so <UNK> because it was true and was <UNK> life after all that was <UNK> with us all\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vIwm7DbuCtDU",
        "colab_type": "text"
      },
      "source": [
        "### Building a Model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "scrolled": true,
        "id": "LB4JVgYJCtDW",
        "colab_type": "code",
        "outputId": "1dcc3958-ebbf-489c-9763-d6678c60db5b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 319
        }
      },
      "source": [
        "print('Build model...')\n",
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 8))\n",
        "model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Build model...\n",
            "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "error",
          "ename": "NameError",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-2-7601b68ed35e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Build model...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSequential\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mEmbedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m8\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLSTM\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m16\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecurrent_dropout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mactivation\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'sigmoid'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mNameError\u001b[0m: name 'max_features' is not defined"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fi0cRt-QCtDa",
        "colab_type": "text"
      },
      "source": [
        "### Model Training"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uSQxXg7eCtDd",
        "colab_type": "code",
        "outputId": "70fcf2fd-20a1-4e06-fe97-4f091ecff01f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 237
        }
      },
      "source": [
        "# try using different optimizers and different optimizer configs\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "\n",
        "# Write the training input and output, batch size, and testing input and output\n",
        "\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n",
            "\n",
            "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3657: The name tf.log is deprecated. Please use tf.math.log instead.\n",
            "\n",
            "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
            "Instructions for updating:\n",
            "Use tf.where in 2.0, which has the same broadcast rule as np.where\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 52s 2ms/step - loss: 0.5450 - acc: 0.7235 - val_loss: 0.4540 - val_acc: 0.7929\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7fc2daaf4c18>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "viOMzEbKCtDh",
        "colab_type": "text"
      },
      "source": [
        "### Testing"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2u_ADdgVCtDi",
        "colab_type": "code",
        "outputId": "68ad9d17-9dd0-45b8-c7d3-d6ea772ec261",
        "colab": {}
      },
      "source": [
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "24960/25000 [============================>.] - ETA: 0sTest score: 0.49805993225097656\n",
            "Test accuracy: 0.75544\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9q3EWD4nCtDn",
        "colab_type": "text"
      },
      "source": [
        "### Prediction"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9qk8CohCCtDo",
        "colab_type": "code",
        "outputId": "707c8b8d-49c7-4b9c-c912-3ecc589e4656",
        "colab": {}
      },
      "source": [
        "prediction = model.predict(x_test[2:3])\n",
        "print('Prediction value:',prediction[0])\n",
        "print('Test Label:',y_test[2:3])\n",
        "print(' '.join(id_to_word[id] for id in x_test[25] ))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Prediction value: [0.82189775]\n",
            "Test Label: [1]\n",
            "<UNK> that should be <UNK> viewing for all <UNK> <UNK> has its <UNK> as well but for other than <UNK> reason <UNK> today is a <UNK> example of the left in full <UNK> <UNK> <UNK> <UNK> <UNK> and given the times <UNK> <UNK> <UNK> the <UNK> 7 such <UNK> <UNK> seemed not that great a <UNK> from the truth but <UNK> years later the <UNK> has <UNK> and <UNK> <UNK> with it's <UNK> <UNK> <UNK> is a pretty silly <UNK>\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fwbxUlIeCtDs",
        "colab_type": "text"
      },
      "source": [
        "### Other RNN Layers\n",
        "\n",
        "* keras.layers.RNN(cell, return_sequences=False)\n",
        "* keras.layers.SimpleRNN(units, activation='tanh')\n",
        "* keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')\n",
        "* keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid', )\n",
        "* keras.layers.SimpleRNNCell(units, activation='tanh')\n",
        "* keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')\n",
        "* keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')\n",
        "* keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')\n",
        "* keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nipqdbdwCtDu",
        "colab_type": "text"
      },
      "source": [
        "# Part 2: RNN Design Choices"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oj3P_6JdCtDw",
        "colab_type": "text"
      },
      "source": [
        "## Influence of number of nodes"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9sHR_d4TCtDx",
        "colab_type": "text"
      },
      "source": [
        "### LSTM with 8 nodes"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yTU64BjqCtDz",
        "colab_type": "code",
        "outputId": "7aa68d78-1e70-4bc2-de31-2e53d6436616",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 8))\n",
        "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_3 (Embedding)      (None, None, 8)           8000      \n",
            "_________________________________________________________________\n",
            "lstm_3 (LSTM)                (None, 8)                 544       \n",
            "_________________________________________________________________\n",
            "dense_3 (Dense)              (None, 1)                 9         \n",
            "=================================================================\n",
            "Total params: 8,553\n",
            "Trainable params: 8,553\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 69s - loss: 0.5377 - acc: 0.7186 - val_loss: 0.4416 - val_acc: 0.7936\n",
            "24960/25000 [============================>.] - ETA: 0sTest score: 0.441626269493103\n",
            "Test accuracy: 0.79364\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wFRHd006CtD3",
        "colab_type": "text"
      },
      "source": [
        "### LSTM with 16 nodes"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CHjkgOD4CtD4",
        "colab_type": "code",
        "outputId": "37d5ea5a-164b-4157-fb52-129e9634e97e",
        "colab": {}
      },
      "source": [
        "#not tried\n",
        "# Write your code here \n",
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 8))\n",
        "model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)\n",
        "# Use the same layer design from the above cell "
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_4 (Embedding)      (None, None, 8)           8000      \n",
            "_________________________________________________________________\n",
            "lstm_4 (LSTM)                (None, 16)                1600      \n",
            "_________________________________________________________________\n",
            "dense_4 (Dense)              (None, 1)                 17        \n",
            "=================================================================\n",
            "Total params: 9,617\n",
            "Trainable params: 9,617\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 70s - loss: 0.5149 - acc: 0.7333 - val_loss: 0.4129 - val_acc: 0.8124\n",
            "24960/25000 [============================>.] - ETA: 0sTest score: 0.4128888432312012\n",
            "Test accuracy: 0.81236\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vCnAs1gsCtD9",
        "colab_type": "text"
      },
      "source": [
        "## Influence of Embedding"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wXBRbnHKCtD-",
        "colab_type": "code",
        "outputId": "254004d8-17ce-4d6a-9abf-96ef91e2d354",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 4))\n",
        "model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_5 (Embedding)      (None, None, 4)           4000      \n",
            "_________________________________________________________________\n",
            "lstm_5 (LSTM)                (None, 16)                1344      \n",
            "_________________________________________________________________\n",
            "dense_5 (Dense)              (None, 1)                 17        \n",
            "=================================================================\n",
            "Total params: 5,361\n",
            "Trainable params: 5,361\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 66s - loss: 0.5176 - acc: 0.7263 - val_loss: 0.4116 - val_acc: 0.8124\n",
            "24960/25000 [============================>.] - ETA: 0sTest score: 0.41163202223777773\n",
            "Test accuracy: 0.81236\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0rGDKWtnCtEC",
        "colab_type": "text"
      },
      "source": [
        "## Influence of Dropout"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7kfBqNM8CtED",
        "colab_type": "text"
      },
      "source": [
        "### Dropout with probability 0.5"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hcqQZmlrCtEE",
        "colab_type": "code",
        "outputId": "7c26da62-a01a-459b-9f0f-7f1eb50e28ff",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 32))\n",
        "model.add(LSTM(8, dropout=0.5, recurrent_dropout=0.5))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_6 (Embedding)      (None, None, 32)          32000     \n",
            "_________________________________________________________________\n",
            "lstm_6 (LSTM)                (None, 8)                 1312      \n",
            "_________________________________________________________________\n",
            "dense_6 (Dense)              (None, 1)                 9         \n",
            "=================================================================\n",
            "Total params: 33,321\n",
            "Trainable params: 33,321\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 74s - loss: 0.6050 - acc: 0.6698 - val_loss: 0.5219 - val_acc: 0.7405\n",
            "24896/25000 [============================>.] - ETA: 0sTest score: 0.521893192024231\n",
            "Test accuracy: 0.74052\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i5QNJ37XCtEJ",
        "colab_type": "text"
      },
      "source": [
        "### Dropout with probability 0.9"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JwyDaT9eCtEK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Write your code here \n",
        "\n",
        "# Use the same model design from the above cell "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CCRL0v0ECtEP",
        "colab_type": "text"
      },
      "source": [
        "## Multilayered RNNs"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rXImrGnLCtEQ",
        "colab_type": "text"
      },
      "source": [
        "### RNN with 2 layer LSTM"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G-PwWdhbCtER",
        "colab_type": "code",
        "outputId": "bba1c799-e1ee-4661-fd58-0e5b4ec46bcb",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 8))\n",
        "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))\n",
        "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_7 (Embedding)      (None, None, 8)           8000      \n",
            "_________________________________________________________________\n",
            "lstm_7 (LSTM)                (None, None, 8)           544       \n",
            "_________________________________________________________________\n",
            "lstm_8 (LSTM)                (None, 8)                 544       \n",
            "_________________________________________________________________\n",
            "dense_7 (Dense)              (None, 1)                 9         \n",
            "=================================================================\n",
            "Total params: 9,097\n",
            "Trainable params: 9,097\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 104s - loss: 0.5088 - acc: 0.7404 - val_loss: 0.4145 - val_acc: 0.8114\n",
            "24928/25000 [============================>.] - ETA: 0sTest score: 0.4145219009399414\n",
            "Test accuracy: 0.81136\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ibb2ZoTkCtEW",
        "colab_type": "text"
      },
      "source": [
        "### RNN with 3 layer LSTM"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TIKdVfv_CtEY",
        "colab_type": "code",
        "outputId": "72715019-57e1-496d-a1fb-92415928f726",
        "colab": {}
      },
      "source": [
        "# Write your code here \n",
        "model = Sequential()\n",
        "model.add(Embedding(max_features, 8))\n",
        "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))\n",
        "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "model.summary()\n",
        "\n",
        "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
        "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n",
        "\n",
        "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n",
        "print('Test score:', score)\n",
        "print('Test accuracy:', acc)\n",
        "# Use the same node design from the above cell "
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_8 (Embedding)      (None, None, 8)           8000      \n",
            "_________________________________________________________________\n",
            "lstm_9 (LSTM)                (None, None, 8)           544       \n",
            "_________________________________________________________________\n",
            "lstm_10 (LSTM)               (None, 8)                 544       \n",
            "_________________________________________________________________\n",
            "dense_8 (Dense)              (None, 1)                 9         \n",
            "=================================================================\n",
            "Total params: 9,097\n",
            "Trainable params: 9,097\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "Train on 25000 samples, validate on 25000 samples\n",
            "Epoch 1/1\n",
            "25000/25000 [==============================] - 107s - loss: 0.5471 - acc: 0.7128 - val_loss: 0.4698 - val_acc: 0.7706\n",
            "24960/25000 [============================>.] - ETA: 0sTest score: 0.4697502194404602\n",
            "Test accuracy: 0.7706\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RxBVbPCtCtEd",
        "colab_type": "text"
      },
      "source": [
        "### What are your findings?"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Lc_SiABPCtEf",
        "colab_type": "text"
      },
      "source": [
        "# Part 3: Recurrent Neural Network with Custom Dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0HdkQ_Z9CtEk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Credits to Peter Nagy"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xCc2eIIICtEq",
        "colab_type": "text"
      },
      "source": [
        "### Load data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DO7umetyCtEr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "data = pd.read_csv('Senti.csv')\n",
        "# Keeping only the neccessary columns\n",
        "data = data[['text','sentiment']]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "collapsed": true,
        "id": "NYgV00eECtEw",
        "colab_type": "text"
      },
      "source": [
        "### Visualize data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rmy-XJXFCtEx",
        "colab_type": "code",
        "outputId": "3d390731-26ea-431a-b882-bfaa036199e0",
        "colab": {}
      },
      "source": [
        "data.head(10)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>text</th>\n",
              "      <th>sentiment</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>I love this car</td>\n",
              "      <td>Positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>This view is amazing</td>\n",
              "      <td>Positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>I feel great this morning</td>\n",
              "      <td>Positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>I am so excited about the concert</td>\n",
              "      <td>Positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>He is my best friend</td>\n",
              "      <td>Positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>I do not like this car</td>\n",
              "      <td>Negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>This view is horrible</td>\n",
              "      <td>Negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>I feel tired this morning</td>\n",
              "      <td>Negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>I am not looking forward to the concert</td>\n",
              "      <td>Negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>He is my enemy</td>\n",
              "      <td>Negative</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                      text sentiment\n",
              "0                          I love this car  Positive\n",
              "1                     This view is amazing  Positive\n",
              "2                I feel great this morning  Positive\n",
              "3        I am so excited about the concert  Positive\n",
              "4                     He is my best friend  Positive\n",
              "5                   I do not like this car  Negative\n",
              "6                    This view is horrible  Negative\n",
              "7                I feel tired this morning  Negative\n",
              "8  I am not looking forward to the concert  Negative\n",
              "9                           He is my enemy  Negative"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wEikVF22CtE1",
        "colab_type": "text"
      },
      "source": [
        "### Format data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xJQLuBfCCtE3",
        "colab_type": "code",
        "outputId": "a62efb30-bc97-450f-9475-0cfae61c2db3",
        "colab": {}
      },
      "source": [
        "data = data[data.sentiment != \"Neutral\"]\n",
        "data['text'] = data['text'].apply(lambda x: x.lower())\n",
        "data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]','',x)))\n",
        "\n",
        "print('Number of positive samples:',data[ data['sentiment'] == 'Positive'].size)\n",
        "print('Number of negative samples:',data[ data['sentiment'] == 'Negative'].size)\n",
        "\n",
        "for idx,row in data.iterrows():\n",
        "    row[0] = row[0].replace('rt',' ')\n",
        "\n",
        "max_fatures = 2000\n",
        "tokenizer = Tokenizer(nb_words=max_fatures, split=' ')\n",
        "tokenizer.fit_on_texts(data['text'].values)\n",
        "X = tokenizer.texts_to_sequences(data['text'].values)\n",
        "X = pad_sequences(X)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Number of positive samples: 10\n",
            "Number of negative samples: 10\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HDCY0n9FCtE8",
        "colab_type": "text"
      },
      "source": [
        "### Training set"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FUeSohbaCtE9",
        "colab_type": "code",
        "outputId": "5139a773-388d-4222-a18f-8b996c9682e4",
        "colab": {}
      },
      "source": [
        "Y = pd.get_dummies(data['sentiment']).values\n",
        "X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)\n",
        "print('Shape of training samples:',X_train.shape,Y_train.shape)\n",
        "print('Shape of testing samples:',X_test.shape,Y_test.shape)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Shape of training samples: (6, 8) (6, 2)\n",
            "Shape of testing samples: (4, 8) (4, 2)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kLgxulzzCtFA",
        "colab_type": "text"
      },
      "source": [
        "### Design a model"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Go3IKrztCtFB",
        "colab_type": "code",
        "outputId": "bb12eece-9692-4a3f-98e5-644d2c117265",
        "colab": {}
      },
      "source": [
        "model = Sequential()\n",
        "model.add(Embedding(max_fatures, 128 ,input_length = X.shape[1], dropout=0.2))\n",
        "model.add(LSTM(128))\n",
        "model.add(Dense(2, activation='softmax'))\n",
        "model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])\n",
        "print(model.summary())"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "embedding_2 (Embedding)      (None, 8, 128)            256000    \n",
            "_________________________________________________________________\n",
            "lstm_2 (LSTM)                (None, 128)               131584    \n",
            "_________________________________________________________________\n",
            "dense_2 (Dense)              (None, 2)                 258       \n",
            "=================================================================\n",
            "Total params: 387,842\n",
            "Trainable params: 387,842\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aJQHBOZICtFF",
        "colab_type": "text"
      },
      "source": [
        "### Training "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mIfPTUFTCtFG",
        "colab_type": "code",
        "outputId": "57dacf7b-a303-4c24-ca22-513d40ae2130",
        "colab": {}
      },
      "source": [
        "batch_size = 32\n",
        "model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 2)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/5\n",
            "0s - loss: 0.6946 - acc: 0.3333\n",
            "Epoch 2/5\n",
            "0s - loss: 0.6864 - acc: 0.6667\n",
            "Epoch 3/5\n",
            "0s - loss: 0.6782 - acc: 0.6667\n",
            "Epoch 4/5\n",
            "0s - loss: 0.6698 - acc: 0.6667\n",
            "Epoch 5/5\n",
            "0s - loss: 0.6607 - acc: 0.6667\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<keras.callbacks.History at 0x7f887cbf3390>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OZrR_N4QCtFL",
        "colab_type": "text"
      },
      "source": [
        "### Validation"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IajDGeVSCtFM",
        "colab_type": "code",
        "outputId": "5e23de55-a0d2-4db7-b671-8b60b5628532",
        "colab": {}
      },
      "source": [
        "score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)\n",
        "print(\"Score: %.2f\" % (score))\n",
        "print(\"Accuracy: %.2f\" % (acc))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Score: 0.73\n",
            "Accuracy: 0.25\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ypTU6dTxCtFS",
        "colab_type": "text"
      },
      "source": [
        "### Formatting Test Example"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gJBsxFN1CtFT",
        "colab_type": "code",
        "outputId": "df201139-7ab8-456c-f164-82d50a3900e1",
        "colab": {}
      },
      "source": [
        "text = 'He is my enemy'\n",
        "tester = np.array([text])\n",
        "tester = pd.DataFrame(tester)\n",
        "tester.columns = ['text']\n",
        "\n",
        "tester['text'] = tester['text'].apply(lambda x: x.lower())\n",
        "tester['text'] = tester['text'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]','',x)))\n",
        "\n",
        "max_fatures = 2000\n",
        "test = tokenizer.texts_to_sequences(tester['text'].values)\n",
        "test = pad_sequences(test)\n",
        "\n",
        "if X.shape[1]>test.shape[1]:\n",
        "    test = np.pad(test[0], (X.shape[1]-test.shape[1],0), 'constant')\n",
        "    \n",
        "test = np.array([test])\n",
        "\n",
        "prediction = model.predict(test)\n",
        "print('Prediction value:',prediction[0])"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Prediction value: [0.53419375 0.46580625]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1AcumMsRCtFX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}