{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# RNN using LSTM \n",
    "       \n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"../imgs/RNN-rolled.png\"/ width=\"80px\" height=\"80px\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"../imgs/RNN-unrolled.png\"/ width=\"400px\" height=\"400px\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"../imgs/LSTM3-chain.png\"/ width=\"60%\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_source: http://colah.github.io/posts/2015-08-Understanding-LSTMs_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.optimizers import SGD\n",
    "from keras.preprocessing.text import one_hot, text_to_word_sequence\n",
    "from keras.utils import np_utils\n",
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, Dropout, Activation\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.recurrent import LSTM, GRU\n",
    "from keras.preprocessing import sequence"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading blog post from data directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIRECTORY = os.path.join(os.path.abspath(os.path.curdir), '..', 'data', 'word_embeddings')\n",
    "print(DATA_DIRECTORY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "male_posts = []\n",
    "female_post = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(os.path.join(DATA_DIRECTORY,\"male_blog_list.txt\"),\"rb\") as male_file:\n",
    "    male_posts= pickle.load(male_file)\n",
    "    \n",
    "with open(os.path.join(DATA_DIRECTORY,\"female_blog_list.txt\"),\"rb\") as female_file:\n",
    "    female_posts = pickle.load(female_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filtered_male_posts = list(filter(lambda p: len(p) > 0, male_posts))\n",
    "filtered_female_posts = list(filter(lambda p: len(p) > 0, female_posts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# text processing - one hot builds index of the words\n",
    "male_one_hot = []\n",
    "female_one_hot = []\n",
    "n = 30000\n",
    "for post in filtered_male_posts:\n",
    "    try:\n",
    "        male_one_hot.append(one_hot(post, n, split=\" \", lower=True))\n",
    "    except:\n",
    "        continue\n",
    "\n",
    "for post in filtered_female_posts:\n",
    "    try:\n",
    "        female_one_hot.append(one_hot(post,n,split=\" \", lower=True))\n",
    "    except:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 0 for male, 1 for female\n",
    "concatenate_array_rnn = np.concatenate((np.zeros(len(male_one_hot)),\n",
    "                                        np.ones(len(female_one_hot))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(np.concatenate((female_one_hot,male_one_hot)),\n",
    "                                                                    concatenate_array_rnn, \n",
    "                                                                    test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 100\n",
    "X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen)\n",
    "X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen)\n",
    "print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)\n",
    "print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "max_features = 30000\n",
    "dimension = 128\n",
    "output_dimension = 128\n",
    "model = Sequential()\n",
    "model.add(Embedding(max_features, dimension))\n",
    "model.add(LSTM(output_dimension))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(Dense(1))\n",
    "model.add(Activation('sigmoid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(X_train_rnn, y_train_rnn, batch_size=32,\n",
    "          epochs=4, validation_data=(X_test_rnn, y_test_rnn))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score, acc = model.evaluate(X_test_rnn, y_test_rnn, batch_size=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(score, acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using TFIDF Vectorizer as an input instead of one hot encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2', min_df=5)\n",
    "tfidf_male = vectorizer.fit_transform(filtered_male_posts)\n",
    "tfidf_female = vectorizer.fit_transform(filtered_female_posts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "flattened_array_tfidf_male = tfidf_male.toarray()\n",
    "flattened_array_tfidf_female = tfidf_male.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_rnn = np.concatenate((np.zeros(len(flattened_array_tfidf_male)),\n",
    "                        np.ones(len(flattened_array_tfidf_female))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(np.concatenate((flattened_array_tfidf_male, \n",
    "                                                                                    flattened_array_tfidf_female)),\n",
    "                                                                    y_rnn,test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 100\n",
    "X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen)\n",
    "X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen)\n",
    "print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)\n",
    "print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "max_features = 30000\n",
    "model = Sequential()\n",
    "model.add(Embedding(max_features, dimension))\n",
    "model.add(LSTM(output_dimension))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(Dense(1))\n",
    "model.add(Activation('sigmoid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.compile(loss='mean_squared_error',optimizer='sgd', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(X_train_rnn, y_train_rnn, \n",
    "          batch_size=32, epochs=1,\n",
    "          validation_data=(X_test_rnn, y_test_rnn))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score,acc = model.evaluate(X_test_rnn, y_test_rnn, \n",
    "                           batch_size=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(score, acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentence Generation using LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# reading all the male text data into one string\n",
    "male_post = ' '.join(filtered_male_posts)\n",
    "\n",
    "#building character set for the male posts\n",
    "character_set_male = set(male_post)\n",
    "#building two indices - character index and index of character\n",
    "char_indices = dict((c, i) for i, c in enumerate(character_set_male))\n",
    "indices_char = dict((i, c) for i, c in enumerate(character_set_male))\n",
    "\n",
    "\n",
    "# cut the text in semi-redundant sequences of maxlen characters\n",
    "maxlen = 20\n",
    "step = 1\n",
    "sentences = []\n",
    "next_chars = []\n",
    "for i in range(0, len(male_post) - maxlen, step):\n",
    "    sentences.append(male_post[i : i + maxlen])\n",
    "    next_chars.append(male_post[i + maxlen])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Vectorisation of input\n",
    "x_male = np.zeros((len(male_post), maxlen, len(character_set_male)), dtype=np.bool)\n",
    "y_male = np.zeros((len(male_post), len(character_set_male)), dtype=np.bool)\n",
    "\n",
    "print(x_male.shape, y_male.shape)\n",
    "\n",
    "for i, sentence in enumerate(sentences):\n",
    "    for t, char in enumerate(sentence):\n",
    "        x_male[i, t, char_indices[char]] = 1\n",
    "    y_male[i, char_indices[next_chars[i]]] = 1\n",
    "\n",
    "print(x_male.shape, y_male.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# build the model: a single LSTM\n",
    "print('Build model...')\n",
    "model = Sequential()\n",
    "model.add(LSTM(128, input_shape=(maxlen, len(character_set_male))))\n",
    "model.add(Dense(len(character_set_male)))\n",
    "model.add(Activation('softmax'))\n",
    "\n",
    "optimizer = RMSprop(lr=0.01)\n",
    "model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "auto_text_generating_male_model.compile(loss='mean_squared_error',optimizer='sgd')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random, sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# helper function to sample an index from a probability array\n",
    "def sample(a, diversity=0.75):\n",
    "    if random.random() > diversity:\n",
    "        return np.argmax(a)\n",
    "    while 1:\n",
    "        i = random.randint(0, len(a)-1)\n",
    "        if a[i] > random.random():\n",
    "            return i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# train the model, output generated text after each iteration\n",
    "for iteration in range(1,10):\n",
    "    print()\n",
    "    print('-' * 50)\n",
    "    print('Iteration', iteration)\n",
    "    model.fit(x_male, y_male, batch_size=128, epochs=1)\n",
    "\n",
    "    start_index = random.randint(0, len(male_post) - maxlen - 1)\n",
    "\n",
    "    for diversity in [0.2, 0.4, 0.6, 0.8]:\n",
    "        print()\n",
    "        print('----- diversity:', diversity)\n",
    "\n",
    "        generated = ''\n",
    "        sentence = male_post[start_index : start_index + maxlen]\n",
    "        generated += sentence\n",
    "        print('----- Generating with seed: \"' + sentence + '\"')\n",
    "\n",
    "        for iteration in range(400):\n",
    "            try:\n",
    "                x = np.zeros((1, maxlen, len(character_set_male)))\n",
    "                for t, char in enumerate(sentence):\n",
    "                    x[0, t, char_indices[char]] = 1.\n",
    "\n",
    "                preds = model.predict(x, verbose=0)[0]\n",
    "                next_index = sample(preds, diversity)\n",
    "                next_char = indices_char[next_index]\n",
    "\n",
    "                generated += next_char\n",
    "                sentence = sentence[1:] + next_char\n",
    "            except:\n",
    "                continue\n",
    "                \n",
    "        print(sentence)\n",
    "        print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}