{
  "cells": [
    {
      "metadata": {
        "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
        "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n\nimport os\nprint(os.listdir(\"../input\"))\n\n# Any results you write to the current directory are saved as output.",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
        "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "from keras.models import Model\nfrom keras.layers import Input, CuDNNLSTM, Dense\nimport numpy as np",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "fcd7da31-a545-41a8-ae89-376b2a8f25e8",
        "_uuid": "66c19897a47dc4565c6c24ed29d4dccada1f0d71",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# --- Training configuration ---\n\n# Mini-batch size passed to model.fit.\nbatch_size = 64\n\n# Number of epochs to train for.\nepochs = 100\n\n# Latent dimensionality of the encoding space (units of each LSTM layer).\nlatent_dim = 256\n\n# Upper bound on the number of sentence pairs read from the corpus.\nnum_samples = 10000\n\n# Path to the data txt file on disk.\ndata_path = '../input/fra-eng/fra.txt'",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "ea0e2e7f-f65e-4fe6-b75e-304efb56f04f",
        "_uuid": "06a6c35f13c93a188005227a3b98fc9b7b21a201",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Vectorize the data.\ninput_texts = []\ntarget_texts = []\ninput_characters = set()\ntarget_characters = set()\n\n# Read the corpus inside a context manager so the file handle is always closed.\n# The text is decoded explicitly instead of relying on the platform default,\n# because the French side contains accented characters.\n# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm for other dumps.\nwith open(data_path, encoding='utf-8') as data_file:\n    lines = data_file.read().split('\\n')\n\nfor line in lines[: min(num_samples, len(lines) - 1)]:\n    # Each line is expected to be: English TAB French.\n    # Only the first two fields are used, so extra trailing columns do not break\n    # the unpacking, and lines without a tab (e.g. blank lines) are skipped\n    # instead of raising ValueError.\n    fields = line.split('\\t')\n    if len(fields) < 2:\n        continue\n    input_text, target_text = fields[:2]\n\n    # We use \"tab\" as the \"start sequence\" character\n    # for the targets, and \"\\n\" as \"end sequence\" character.\n    target_text = '\\t' + target_text + '\\n'\n    input_texts.append(input_text)\n    target_texts.append(target_text)\n\n    # Collect every distinct character seen on the input and output side.\n    input_characters.update(input_text)\n    target_characters.update(target_text)\n\nprint('Number of samples:', len(input_texts))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "de998ecc-341a-4a47-830a-1dc20f0ae904",
        "_uuid": "555c970d2117022365512bf3d84246a909f44fc2",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Turn each character set into a sorted list so the character order\n# (and therefore every token index derived from it) is the same on each run.\ninput_characters = sorted(input_characters)\ntarget_characters = sorted(target_characters)\n\n# Vocabulary sizes: the English side and the French side\n# (letters plus numbers, signs, etc.).\nnum_encoder_tokens = len(input_characters)\nnum_decoder_tokens = len(target_characters)\n\nprint('Number of unique input tokens:', num_encoder_tokens)\nprint('Number of unique output tokens:', num_decoder_tokens)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "b99e23e7-2faa-4ce7-a6f3-784e02ef46c7",
        "_uuid": "a52af2a93fd956dd9a33c02baf3aa5a73bd7a1ec",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Character-level equivalent of a tokenizer: each lookup table maps a\n# character to its position in the corresponding character sequence.\ninput_token_index = dict(zip(input_characters, range(len(input_characters))))\ntarget_token_index = dict(zip(target_characters, range(len(target_characters))))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "76afcd7b-23bd-44c3-b3fb-978cba081f21",
        "_uuid": "43f7153aa4595b48da4338c300712a7c16fcfa01",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Demo character tokenization: print the token id of every character\n# of a sample sentence, separated by spaces.\nprint(*(input_token_index[c] for c in 'the cat sits on the mat'), end=' ')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "7d4ad6ea-17fc-479a-87fe-3822fdd161c2",
        "_uuid": "b88478d0f15b9642f905395d8c45445f628719b2",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Length (in characters) of the longest sentence on each side.\nmax_encoder_seq_length = max(map(len, input_texts))\nmax_decoder_seq_length = max(map(len, target_texts))\n\nprint('Max sequence length for inputs:', max_encoder_seq_length)\nprint('Max sequence length for outputs:', max_decoder_seq_length)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "2c39c2c1-072c-4e5c-99bd-e13a62ebed04",
        "_uuid": "7231966fbdb13bffb3805d69856d354acb1d15ce",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Allocate the three one-hot tensors as float32 arrays filled with zeros.\n\n# encoder_input_data: 3D array of shape\n# (num_pairs, max_english_sentence_length, num_english_characters)\n# containing a one-hot vectorization of the English sentences.\nencoder_input_data = np.zeros(\n    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),\n    dtype=np.float32)\n\n# decoder_input_data: 3D array of shape\n# (num_pairs, max_french_sentence_length, num_french_characters)\n# containing a one-hot vectorization of the French sentences.\ndecoder_input_data = np.zeros(\n    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),\n    dtype=np.float32)\n\n# decoder_target_data: same as decoder_input_data but offset by one timestep,\n# i.e. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].\ndecoder_target_data = np.zeros(\n    shape=(len(input_texts), max_decoder_seq_length, num_decoder_tokens),\n    dtype=np.float32)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "8d1014c8-82a6-4988-a31e-33bd3498b585",
        "_uuid": "d12785864708eaa8feec3b7aba862b02bec76e3b",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Fill the one-hot tensors, one sentence pair at a time.\nfor sample_idx, (source_text, translated_text) in enumerate(zip(input_texts, target_texts)):\n    # Encoder input: mark the token of every character of the input text.\n    for step, symbol in enumerate(source_text):\n        encoder_input_data[sample_idx, step, input_token_index[symbol]] = 1.\n\n    # Decoder input: the full target text, start character included.\n    for step, symbol in enumerate(translated_text):\n        decoder_input_data[sample_idx, step, target_token_index[symbol]] = 1.\n\n    # Decoder target: the same text shifted one timestep ahead, i.e. without\n    # the start character, so step t holds the character at position t + 1.\n    for step, symbol in enumerate(translated_text[1:]):\n        decoder_target_data[sample_idx, step, target_token_index[symbol]] = 1.",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "909d7eb1-54f5-4b96-8745-34080104dcc4",
        "_uuid": "14f0c33f929a0ef7aaf1fa23cff824ecb4b4fe6f",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# --- Encoder ---\n# Input: sequences of any length of one-hot encoded input characters.\nencoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')\n\n# The return_state constructor argument configures an RNN layer to return a list\n# whose first entry is the outputs and whose next entries are the internal RNN\n# states. This is how the states of the encoder are recovered.\nencoder = CuDNNLSTM(latent_dim, return_state=True, name='encoder')\nencoder_outputs, state_h, state_c = encoder(encoder_inputs)\n\n# `encoder_outputs` is discarded; only the states are kept.\nencoder_states = [state_h, state_c]\n\n# --- Decoder ---\n# Set up the decoder, using `encoder_states` as initial state.\ndecoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')\n\n# The decoder returns full output sequences as well as its internal states.\n# The returned states are not used in the training model, but they are used\n# in inference.\ndecoder_lstm = CuDNNLSTM(latent_dim,\n                         return_sequences=True,\n                         return_state=True,\n                         name='decoder_lstm')\n\n# The initial_state call argument specifies the initial state(s) of an RNN.\n# Here it passes the encoder states to the decoder, basically making the\n# first memory of the decoder the encoded semantics.\ndecoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)\n\n# Softmax over the output characters for every decoder step.\ndecoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')\ndecoder_outputs = decoder_dense(decoder_outputs)\n\n# Define the model that will turn\n# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`\nmodel = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "1e70cde9-4c5d-4de2-8f41-90a7c5bd3475",
        "_uuid": "72492095eea44a02d9f392b2274606382cf11f64",
        "scrolled": true,
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Run training: compile with RMSprop and categorical cross-entropy, then fit\n# while holding out 20% of the samples for validation.\nmodel.compile(loss='categorical_crossentropy', optimizer='rmsprop')\nhistory = model.fit(x=[encoder_input_data, decoder_input_data],\n                    y=decoder_target_data,\n                    epochs=epochs,\n                    batch_size=batch_size,\n                    validation_split=0.2)\n# Save model\n#model.save('s2s.h5')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "55d758b0-b642-4974-b028-82f920d4602b",
        "_uuid": "ca7c6f484ebbe4aed426744aa2197cf194c3ba26",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "import matplotlib.pyplot as plt\n\n# Plot the recorded training and validation loss of each epoch.\nplt.figure(figsize=(10, 7))\ntrain_curve = plt.plot(history.history['loss'], label='Training Loss')[0]\nval_curve = plt.plot(history.history['val_loss'], label='Validation Loss')[0]\nplt.legend(handles=[train_curve, val_curve])\nplt.show()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "2ad8f7e4-72a4-4884-8be4-d9fb4e1bd126",
        "_uuid": "61c95ebe8356e8f91cd8affc10e14acc1596421d"
      },
      "cell_type": "markdown",
      "source": "# Creating inference models"
    },
    {
      "metadata": {
        "_cell_guid": "239a7196-6485-4f65-8dea-4ca1a9aa87df",
        "_uuid": "1c10010e6db47a30edd82bee80e77a0241a79fcf",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Inference encoder: maps the encoder input to the encoder states [state_h, state_c].\nencoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "bb95b6a7-772b-458a-bd23-411e98e32205",
        "_uuid": "343f0049794312cf4fd1511c5f34ce36a9e26982",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Inference decoder.\n\n# Inputs carrying the decoder memory (hidden and cell state); on the first\n# step these are fed with the states produced by the encoder.\ndecoder_state_input_h = Input(shape=(latent_dim,))\ndecoder_state_input_c = Input(shape=(latent_dim,))\ndecoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n\n# Reuse the decoder LSTM layer, starting from the supplied states.\ndecoder_outputs, state_h, state_c = decoder_lstm(\n    decoder_inputs, initial_state=decoder_states_inputs)\ndecoder_states = [state_h, state_c]\n\n# Predict the next char.\ndecoder_outputs = decoder_dense(decoder_outputs)\n\n# The model takes in the encoder memory plus its own memory as an input and\n# spits out a prediction plus its own memory to be used for the next char.\ndecoder_model = Model(\n    inputs=[decoder_inputs, decoder_state_input_h, decoder_state_input_c],\n    outputs=[decoder_outputs, state_h, state_c])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "5f19066a-c7e4-421b-a6cf-a2f463fd75fb",
        "_uuid": "c3c1dd185d7e8532f088903bf634f6c14d18fb2d",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Inverse lookup tables (token index -> character), used to decode\n# sequences back to something readable.\nreverse_input_char_index = dict(\n    zip(input_token_index.values(), input_token_index.keys()))\nreverse_target_char_index = dict(\n    zip(target_token_index.values(), target_token_index.keys()))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "d6e067d6-e27f-47e9-b36d-8c8469fef085",
        "_uuid": "a1b14b5e6f0a1c34d6ffd3a27b302e79c0d31ea5",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "def decode_sequence(input_seq):\n    \"\"\"Greedily decode one encoded input sequence into target text.\n\n    The encoder states for `input_seq` seed the decoder, which is then run one\n    character at a time starting from the tab start character. At each step\n    the highest-scoring character is appended to the result and fed back in,\n    until the newline stop character is produced or the result grows longer\n    than `max_decoder_seq_length`.\n\n    Args:\n        input_seq: one-hot input handed to `encoder_model.predict`; a batch\n            of size 1 is assumed.\n\n    Returns:\n        The decoded string, including the stop character when one was sampled.\n    \"\"\"\n    # Encode the input as state vectors.\n    states_value = encoder_model.predict(input_seq)\n\n    # Length-1 target sequence holding only the one-hot start character.\n    target_seq = np.zeros((1, 1, num_decoder_tokens))\n    target_seq[0, 0, target_token_index['\\t']] = 1.\n\n    decoded_sentence = ''\n\n    # Sample one character per iteration until a stop condition is met.\n    while True:\n        # Decoder prediction plus its updated internal states.\n        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)\n\n        # Pick the token with the highest score and look up its character.\n        sampled_token_index = np.argmax(output_tokens[0, -1, :])\n        sampled_char = reverse_target_char_index[sampled_token_index]\n        decoded_sentence += sampled_char\n\n        # Exit condition: either find the stop character or hit max length.\n        if sampled_char == '\\n' or len(decoded_sentence) > max_decoder_seq_length:\n            break\n\n        # Feed the sampled character and the new states into the next step.\n        target_seq = np.zeros((1, 1, num_decoder_tokens))\n        target_seq[0, 0, sampled_token_index] = 1.\n        states_value = [h, c]\n\n    return decoded_sentence",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "c171e7a9-5642-43b8-ac4d-a0150ad20e14",
        "_uuid": "3a78fc8d64f0bca2650fd73c6356fd0b5a6bf171",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Sample sentence to translate.\nmy_text = 'Thanks!'\n\n# All-zero one-hot buffer for a single sentence, with 10 more timesteps\n# than the sample has characters.\nplaceholder = np.zeros(shape=(1, len(my_text) + 10, num_encoder_tokens))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "e211bbb7-1dc7-40aa-96b9-d9372adea403",
        "_uuid": "10d39df3fdd008e15853d873c4a496a0568a872a",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# One-hot encode the sample sentence into `placeholder`, printing each\n# position, character and token index along the way.\nfor position, symbol in enumerate(my_text):\n    token_id = input_token_index[symbol]\n    print(position, symbol, token_id)\n    placeholder[0, position, token_id] = 1",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "9c45d63b-fcec-4611-bc76-59b1c5322eb9",
        "_uuid": "d94521498349ca4e3a8010f283f9e4057a5b8e78",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Translate the encoded sample sentence; the decoded string is the cell's output.\ndecode_sequence(input_seq=placeholder)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "5f27b5be-c657-4b6e-8484-d53615f3fab8",
        "_uuid": "510249e948055ee16fe81474b22eafd3055bd310",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.4",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}