{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.4" }, "colab": { "name": "kl_Recurrent_Neural_Network.ipynb", "version": "0.3.2", "provenance": [], "collapsed_sections": [], "include_colab_link": true } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "mAXdHLsmCtC9", "colab_type": "text" }, "source": [ "# Part 1: Recurrent Neural Network " ] }, { "cell_type": "markdown", "metadata": { "id": "SfdIvQcyCtC_", "colab_type": "text" }, "source": [ "### Importing packages" ] }, { "cell_type": "code", "metadata": { "id": "rgd8svPhCtDA", "colab_type": "code", "colab": {} }, "source": [ "import re\n", "import numpy as np\n", "import pandas as pd \n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.model_selection import train_test_split\n", "\n", "from keras.preprocessing import sequence\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Embedding, LSTM\n", "from keras.datasets import imdb\n", "\n", "from keras.utils.np_utils import to_categorical\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "import ssl\n", "ssl._create_default_https_context = ssl._create_unverified_context" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "sWmzIsFjCtDG", "colab_type": "text" }, "source": [ "### Preparing Dataset" ] }, { "cell_type": "code", "metadata": { "id": "TE1lgX_nCtDH", "colab_type": "code", "outputId": 
"43c7b68e-d7f5-4490-cde9-e99b57a133ea", "colab": { "base_uri": "https://localhost:8080/", "height": 163 } }, "source": [ "max_features = 1000\n", "maxlen = 80 # cut texts after this number of words (among top max_features most common words)\n", "batch_size = 32\n", "\n", "print('Loading data...')\n", "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n", "print(len(x_train), 'train sequences')\n", "print(len(x_test), 'test sequences')\n", "\n", "print('Pad sequences (samples x time)')\n", "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n", "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n", "print('x_train shape:', x_train.shape)\n", "print('x_test shape:', x_test.shape)" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "Loading data...\n", "Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz\n", "17465344/17464789 [==============================] - 1s 0us/step\n", "25000 train sequences\n", "25000 test sequences\n", "Pad sequences (samples x time)\n", "x_train shape: (25000, 80)\n", "x_test shape: (25000, 80)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "RrX2lqw1CtDO", "colab_type": "text" }, "source": [ "### Visualize the data" ] }, { "cell_type": "code", "metadata": { "id": "GV296kh6CtDP", "colab_type": "code", "outputId": "5d97985c-35a6-4904-c024-a684c9b488da", "colab": { "base_uri": "https://localhost:8080/", "height": 92 } }, "source": [ "INDEX_FROM=3 # word index offset\n", "\n", "word_to_id = imdb.get_word_index()\n", "word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}\n", "word_to_id[\"\"] = 0\n", "word_to_id[\"\"] = 1\n", "word_to_id[\"\"] = 2\n", "\n", "id_to_word = {value:key for key,value in word_to_id.items()}\n", "print(' '.join(id_to_word[id] for id in x_train[0] ))" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "Downloading data from 
https://s3.amazonaws.com/text-datasets/imdb_word_index.json\n", "1646592/1641221 [==============================] - 1s 0us/step\n", "that played the of and paul they were just brilliant children are often left out of the i think because the stars that play them all up are such a big for the whole film but these children are amazing and should be for what they have done don't you think the whole story was so because it was true and was life after all that was with us all\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "vIwm7DbuCtDU", "colab_type": "text" }, "source": [ "### Building a Model" ] }, { "cell_type": "code", "metadata": { "scrolled": true, "id": "LB4JVgYJCtDW", "colab_type": "code", "outputId": "1dcc3958-ebbf-489c-9763-d6678c60db5b", "colab": { "base_uri": "https://localhost:8080/", "height": 319 } }, "source": [ "print('Build model...')\n", "model = Sequential()\n", "model.add(Embedding(max_features, 8))\n", "model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "Build model...\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. 
Please use tf.compat.v1.get_default_graph instead.\n", "\n" ], "name": "stdout" }, { "output_type": "error", "ename": "NameError", "evalue": "ignored", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Build model...'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSequential\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mEmbedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m8\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLSTM\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m16\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecurrent_dropout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mactivation\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'sigmoid'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;31mNameError\u001b[0m: name 'max_features' is not defined" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "fi0cRt-QCtDa", "colab_type": "text" }, "source": [ "### Model Training" ] }, { "cell_type": "code", "metadata": { "id": "uSQxXg7eCtDd", "colab_type": "code", "outputId": "70fcf2fd-20a1-4e06-fe97-4f091ecff01f", "colab": { "base_uri": "https://localhost:8080/", "height": 237 } }, "source": [ "# try using different optimizers and different optimizer configs\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# Write the training input and output, batch size, and testing input and output\n", "\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3657: The name tf.log is deprecated. 
Please use tf.math.log instead.\n", "\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 52s 2ms/step - loss: 0.5450 - acc: 0.7235 - val_loss: 0.4540 - val_acc: 0.7929\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "viOMzEbKCtDh", "colab_type": "text" }, "source": [ "### Testing" ] }, { "cell_type": "code", "metadata": { "id": "2u_ADdgVCtDi", "colab_type": "code", "outputId": "68ad9d17-9dd0-45b8-c7d3-d6ea772ec261", "colab": {} }, "source": [ "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "24960/25000 [============================>.] 
- ETA: 0sTest score: 0.49805993225097656\n", "Test accuracy: 0.75544\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "9q3EWD4nCtDn", "colab_type": "text" }, "source": [ "### Prediction" ] }, { "cell_type": "code", "metadata": { "id": "9qk8CohCCtDo", "colab_type": "code", "outputId": "707c8b8d-49c7-4b9c-c912-3ecc589e4656", "colab": {} }, "source": [ "# The prediction, the true label and the decoded text must all refer to the same test review.\n", "sample_idx = 2\n", "\n", "# Slice (rather than index) so the model receives a batch of shape (1, maxlen).\n", "prediction = model.predict(x_test[sample_idx:sample_idx + 1])\n", "print('Prediction value:',prediction[0])\n", "print('Test Label:',y_test[sample_idx:sample_idx + 1])\n", "print(' '.join(id_to_word[token_id] for token_id in x_test[sample_idx]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Prediction value: [0.82189775]\n", "Test Label: [1]\n", " that should be viewing for all has its as well but for other than reason today is a example of the left in full and given the times the 7 such seemed not that great a from the truth but years later the has and with it's is a pretty silly \n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "fwbxUlIeCtDs", "colab_type": "text" }, "source": [ "### Other RNN Layers\n", "\n", "* keras.layers.RNN(cell, return_sequences=False)\n", "* keras.layers.SimpleRNN(units, activation='tanh')\n", "* keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')\n", "* keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid', )\n", "* keras.layers.SimpleRNNCell(units, activation='tanh')\n", "* keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')\n", "* keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')\n", "* keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')\n", "* keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')" ] }, { "cell_type": "markdown", "metadata": { "id": "nipqdbdwCtDu", "colab_type": "text" }, "source": [ "# Part 2: RNN Design Choices" ] }, { "cell_type": "markdown", "metadata": { "id": "oj3P_6JdCtDw", "colab_type": "text" },
"source": [ "## Influence of number of nodes" ] }, { "cell_type": "markdown", "metadata": { "id": "9sHR_d4TCtDx", "colab_type": "text" }, "source": [ "### LSTM with 8 nodes" ] }, { "cell_type": "code", "metadata": { "id": "yTU64BjqCtDz", "colab_type": "code", "outputId": "7aa68d78-1e70-4bc2-de31-2e53d6436616", "colab": {} }, "source": [ "model = Sequential()\n", "model.add(Embedding(max_features, 8))\n", "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_3 (Embedding) (None, None, 8) 8000 \n", "_________________________________________________________________\n", "lstm_3 (LSTM) (None, 8) 544 \n", "_________________________________________________________________\n", "dense_3 (Dense) (None, 1) 9 \n", "=================================================================\n", "Total params: 8,553\n", "Trainable params: 8,553\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 69s - loss: 0.5377 - acc: 0.7186 - val_loss: 0.4416 - val_acc: 0.7936\n", "24960/25000 [============================>.] 
- ETA: 0sTest score: 0.441626269493103\n", "Test accuracy: 0.79364\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "wFRHd006CtD3", "colab_type": "text" }, "source": [ "### LSTM with 16 nodes" ] }, { "cell_type": "code", "metadata": { "id": "CHjkgOD4CtD4", "colab_type": "code", "outputId": "37d5ea5a-164b-4157-fb52-129e9634e97e", "colab": {} }, "source": [ "#not tried\n", "# Write your code here \n", "model = Sequential()\n", "model.add(Embedding(max_features, 8))\n", "model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)\n", "# Use the same layer design from the above cell " ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_4 (Embedding) (None, None, 8) 8000 \n", "_________________________________________________________________\n", "lstm_4 (LSTM) (None, 16) 1600 \n", "_________________________________________________________________\n", "dense_4 (Dense) (None, 1) 17 \n", "=================================================================\n", "Total params: 9,617\n", "Trainable params: 9,617\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 70s - loss: 0.5149 - acc: 0.7333 - val_loss: 0.4129 - val_acc: 0.8124\n", "24960/25000 
[============================>.] - ETA: 0sTest score: 0.4128888432312012\n", "Test accuracy: 0.81236\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "vCnAs1gsCtD9", "colab_type": "text" }, "source": [ "## Influence of Embedding" ] }, { "cell_type": "code", "metadata": { "id": "wXBRbnHKCtD-", "colab_type": "code", "outputId": "254004d8-17ce-4d6a-9abf-96ef91e2d354", "colab": {} }, "source": [ "model = Sequential()\n", "model.add(Embedding(max_features, 4))\n", "model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_5 (Embedding) (None, None, 4) 4000 \n", "_________________________________________________________________\n", "lstm_5 (LSTM) (None, 16) 1344 \n", "_________________________________________________________________\n", "dense_5 (Dense) (None, 1) 17 \n", "=================================================================\n", "Total params: 5,361\n", "Trainable params: 5,361\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 66s - loss: 0.5176 - acc: 0.7263 - val_loss: 0.4116 - val_acc: 0.8124\n", "24960/25000 [============================>.] 
- ETA: 0sTest score: 0.41163202223777773\n", "Test accuracy: 0.81236\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "0rGDKWtnCtEC", "colab_type": "text" }, "source": [ "## Influence of Dropout" ] }, { "cell_type": "markdown", "metadata": { "id": "7kfBqNM8CtED", "colab_type": "text" }, "source": [ "### Dropout with probability 0.5" ] }, { "cell_type": "code", "metadata": { "id": "hcqQZmlrCtEE", "colab_type": "code", "outputId": "7c26da62-a01a-459b-9f0f-7f1eb50e28ff", "colab": {} }, "source": [ "model = Sequential()\n", "model.add(Embedding(max_features, 32))\n", "model.add(LSTM(8, dropout=0.5, recurrent_dropout=0.5))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_6 (Embedding) (None, None, 32) 32000 \n", "_________________________________________________________________\n", "lstm_6 (LSTM) (None, 8) 1312 \n", "_________________________________________________________________\n", "dense_6 (Dense) (None, 1) 9 \n", "=================================================================\n", "Total params: 33,321\n", "Trainable params: 33,321\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 74s - loss: 0.6050 - acc: 0.6698 - val_loss: 0.5219 - val_acc: 
0.7405\n", "24896/25000 [============================>.] - ETA: 0sTest score: 0.521893192024231\n", "Test accuracy: 0.74052\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "i5QNJ37XCtEJ", "colab_type": "text" }, "source": [ "### Dropout with probability 0.9" ] }, { "cell_type": "code", "metadata": { "id": "JwyDaT9eCtEK", "colab_type": "code", "colab": {} }, "source": [ "# Write your code here \n", "\n", "# Use the same model design from the above cell " ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "CCRL0v0ECtEP", "colab_type": "text" }, "source": [ "## Multilayered RNNs" ] }, { "cell_type": "markdown", "metadata": { "id": "rXImrGnLCtEQ", "colab_type": "text" }, "source": [ "### RNN with 2 layer LSTM" ] }, { "cell_type": "code", "metadata": { "id": "G-PwWdhbCtER", "colab_type": "code", "outputId": "bba1c799-e1ee-4661-fd58-0e5b4ec46bcb", "colab": {} }, "source": [ "model = Sequential()\n", "model.add(Embedding(max_features, 8))\n", "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))\n", "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_7 (Embedding) (None, None, 8) 8000 \n", "_________________________________________________________________\n", "lstm_7 (LSTM) (None, None, 8) 544 \n", 
"_________________________________________________________________\n", "lstm_8 (LSTM) (None, 8) 544 \n", "_________________________________________________________________\n", "dense_7 (Dense) (None, 1) 9 \n", "=================================================================\n", "Total params: 9,097\n", "Trainable params: 9,097\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 104s - loss: 0.5088 - acc: 0.7404 - val_loss: 0.4145 - val_acc: 0.8114\n", "24928/25000 [============================>.] - ETA: 0sTest score: 0.4145219009399414\n", "Test accuracy: 0.81136\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "ibb2ZoTkCtEW", "colab_type": "text" }, "source": [ "### RNN with 3 layer LSTM" ] }, { "cell_type": "code", "metadata": { "id": "TIKdVfv_CtEY", "colab_type": "code", "outputId": "72715019-57e1-496d-a1fb-92415928f726", "colab": {} }, "source": [ "# Write your code here \n", "model = Sequential()\n", "model.add(Embedding(max_features, 8))\n", "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True))\n", "model.add(LSTM(8, dropout=0.0, recurrent_dropout=0.0))\n", "model.add(Dense(1, activation='sigmoid'))\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))\n", "\n", "score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)\n", "print('Test score:', score)\n", "print('Test accuracy:', acc)\n", "# Use the same node design from the above cell " ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", 
"=================================================================\n", "embedding_8 (Embedding) (None, None, 8) 8000 \n", "_________________________________________________________________\n", "lstm_9 (LSTM) (None, None, 8) 544 \n", "_________________________________________________________________\n", "lstm_10 (LSTM) (None, 8) 544 \n", "_________________________________________________________________\n", "dense_8 (Dense) (None, 1) 9 \n", "=================================================================\n", "Total params: 9,097\n", "Trainable params: 9,097\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "Train on 25000 samples, validate on 25000 samples\n", "Epoch 1/1\n", "25000/25000 [==============================] - 107s - loss: 0.5471 - acc: 0.7128 - val_loss: 0.4698 - val_acc: 0.7706\n", "24960/25000 [============================>.] - ETA: 0sTest score: 0.4697502194404602\n", "Test accuracy: 0.7706\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "RxBVbPCtCtEd", "colab_type": "text" }, "source": [ "### What are your findings?" 
] }, { "cell_type": "markdown", "metadata": { "id": "Lc_SiABPCtEf", "colab_type": "text" }, "source": [ "# Part 3: Recurrent Neural Network with Custom Dataset" ] }, { "cell_type": "code", "metadata": { "id": "0HdkQ_Z9CtEk", "colab_type": "code", "colab": {} }, "source": [ "# Credits to Peter Nagy" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "xCc2eIIICtEq", "colab_type": "text" }, "source": [ "### Load data" ] }, { "cell_type": "code", "metadata": { "id": "DO7umetyCtEr", "colab_type": "code", "colab": {} }, "source": [ "data = pd.read_csv('Senti.csv')\n", "# Keep only the necessary columns\n", "data = data[['text','sentiment']]" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "collapsed": true, "id": "NYgV00eECtEw", "colab_type": "text" }, "source": [ "### Visualize data" ] }, { "cell_type": "code", "metadata": { "id": "rmy-XJXFCtEx", "colab_type": "code", "outputId": "3d390731-26ea-431a-b882-bfaa036199e0", "colab": {} }, "source": [ "data.head(10)" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentiment
0I love this carPositive
1This view is amazingPositive
2I feel great this morningPositive
3I am so excited about the concertPositive
4He is my best friendPositive
5I do not like this carNegative
6This view is horribleNegative
7I feel tired this morningNegative
8I am not looking forward to the concertNegative
9He is my enemyNegative
\n", "
" ], "text/plain": [ " text sentiment\n", "0 I love this car Positive\n", "1 This view is amazing Positive\n", "2 I feel great this morning Positive\n", "3 I am so excited about the concert Positive\n", "4 He is my best friend Positive\n", "5 I do not like this car Negative\n", "6 This view is horrible Negative\n", "7 I feel tired this morning Negative\n", "8 I am not looking forward to the concert Negative\n", "9 He is my enemy Negative" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "wEikVF22CtE1", "colab_type": "text" }, "source": [ "### Format data" ] }, { "cell_type": "code", "metadata": { "id": "xJQLuBfCCtE3", "colab_type": "code", "outputId": "a62efb30-bc97-450f-9475-0cfae61c2db3", "colab": {} }, "source": [ "data = data[data.sentiment != \"Neutral\"]\n", "data['text'] = data['text'].apply(lambda x: x.lower())\n", "data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]','',x)))\n", "\n", "print('Number of positive samples:',data[ data['sentiment'] == 'Positive'].size)\n", "print('Number of negative samples:',data[ data['sentiment'] == 'Negative'].size)\n", "\n", "for idx,row in data.iterrows():\n", " row[0] = row[0].replace('rt',' ')\n", "\n", "max_fatures = 2000\n", "tokenizer = Tokenizer(nb_words=max_fatures, split=' ')\n", "tokenizer.fit_on_texts(data['text'].values)\n", "X = tokenizer.texts_to_sequences(data['text'].values)\n", "X = pad_sequences(X)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Number of positive samples: 10\n", "Number of negative samples: 10\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "HDCY0n9FCtE8", "colab_type": "text" }, "source": [ "### Training set" ] }, { "cell_type": "code", "metadata": { "id": "FUeSohbaCtE9", "colab_type": "code", "outputId": "5139a773-388d-4222-a18f-8b996c9682e4", "colab": {} }, "source": [ "Y = pd.get_dummies(data['sentiment']).values\n", "X_train, X_test, Y_train, Y_test 
= train_test_split(X,Y, test_size = 0.33, random_state = 42)\n", "print('Shape of training samples:',X_train.shape,Y_train.shape)\n", "print('Shape of testing samples:',X_test.shape,Y_test.shape)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Shape of training samples: (6, 8) (6, 2)\n", "Shape of testing samples: (4, 8) (4, 2)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "kLgxulzzCtFA", "colab_type": "text" }, "source": [ "### Design a model" ] }, { "cell_type": "code", "metadata": { "id": "Go3IKrztCtFB", "colab_type": "code", "outputId": "bb12eece-9692-4a3f-98e5-644d2c117265", "colab": {} }, "source": [ "from keras.layers import SpatialDropout1D\n", "\n", "# Embedding -> dropout -> LSTM -> 2-way softmax over the one-hot sentiment labels.\n", "model = Sequential()\n", "model.add(Embedding(max_fatures, 128, input_length = X.shape[1]))\n", "# Keras 2 no longer supports a `dropout` argument on Embedding;\n", "# a SpatialDropout1D layer right after it is the documented replacement.\n", "model.add(SpatialDropout1D(0.2))\n", "model.add(LSTM(128))\n", "model.add(Dense(2, activation='softmax'))\n", "model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])\n", "# summary() prints the table itself and returns None, so it is not wrapped in print().\n", "model.summary()" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_2 (Embedding) (None, 8, 128) 256000 \n", "_________________________________________________________________\n", "lstm_2 (LSTM) (None, 128) 131584 \n", "_________________________________________________________________\n", "dense_2 (Dense) (None, 2) 258 \n", "=================================================================\n", "Total params: 387,842\n", "Trainable params: 387,842\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "aJQHBOZICtFF", "colab_type": "text" }, "source": [ "### Training " ] }, { "cell_type": "code", "metadata": { "id": "mIfPTUFTCtFG", "colab_type": "code", "outputId":
"57dacf7b-a303-4c24-ca22-513d40ae2130", "colab": {} }, "source": [ "batch_size = 32\n", "model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 2)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/5\n", "0s - loss: 0.6946 - acc: 0.3333\n", "Epoch 2/5\n", "0s - loss: 0.6864 - acc: 0.6667\n", "Epoch 3/5\n", "0s - loss: 0.6782 - acc: 0.6667\n", "Epoch 4/5\n", "0s - loss: 0.6698 - acc: 0.6667\n", "Epoch 5/5\n", "0s - loss: 0.6607 - acc: 0.6667\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "OZrR_N4QCtFL", "colab_type": "text" }, "source": [ "### Validation" ] }, { "cell_type": "code", "metadata": { "id": "IajDGeVSCtFM", "colab_type": "code", "outputId": "5e23de55-a0d2-4db7-b671-8b60b5628532", "colab": {} }, "source": [ "score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)\n", "print(\"Score: %.2f\" % (score))\n", "print(\"Accuracy: %.2f\" % (acc))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Score: 0.73\n", "Accuracy: 0.25\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "ypTU6dTxCtFS", "colab_type": "text" }, "source": [ "### Formatting Test Example" ] }, { "cell_type": "code", "metadata": { "id": "gJBsxFN1CtFT", "colab_type": "code", "outputId": "df201139-7ab8-456c-f164-82d50a3900e1", "colab": {} }, "source": [ "text = 'He is my enemy'\n", "tester = np.array([text])\n", "tester = pd.DataFrame(tester)\n", "tester.columns = ['text']\n", "\n", "tester['text'] = tester['text'].apply(lambda x: x.lower())\n", "tester['text'] = tester['text'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]','',x)))\n", "\n", "max_fatures = 2000\n", "test = tokenizer.texts_to_sequences(tester['text'].values)\n", "test = pad_sequences(test)\n", "\n", "if X.shape[1]>test.shape[1]:\n", " test = np.pad(test[0], 
(X.shape[1]-test.shape[1],0), 'constant')\n", " \n", "test = np.array([test])\n", "\n", "prediction = model.predict(test)\n", "print('Prediction value:',prediction[0])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Prediction value: [0.53419375 0.46580625]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "1AcumMsRCtFX", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }