# %%
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import CuDNNLSTM, Dense, Input
from keras.models import Model

# Input data files are available in the "../input/" directory.
# Running this lists the files in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# %%
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = '../input/fra-eng/fra.txt'


# %%
def load_sentence_pairs(path, limit):
    """Read at most `limit` (input, target) sentence pairs from a text file.

    Every usable line holds the input (English) sentence, a TAB, and the
    target (French) sentence. Additional tab-separated fields are ignored and
    lines with fewer than two fields (for example blank lines) are skipped
    instead of aborting the load.

    Each target is stored as '\\t' + sentence + '\\n': the tab serves as the
    "start sequence" character and the newline as the "end sequence"
    character for the decoder.

    Args:
        path: Location of the tab-separated data file.
        limit: Maximum number of pairs to return.

    Returns:
        A tuple (input_texts, target_texts) of two equally long lists of str.
    """
    input_texts, target_texts = [], []
    # The file is opened as UTF-8 explicitly (the platform default encoding
    # may differ) and inside `with` so the handle is always closed.
    with open(path, encoding='utf-8') as data_file:
        for line in data_file:
            if len(input_texts) >= limit:
                break
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line: nothing to translate
            input_texts.append(fields[0])
            target_texts.append('\t' + fields[1] + '\n')
    return input_texts, target_texts


# %%
# Vectorize the data.
input_texts, target_texts = load_sentence_pairs(data_path, num_samples)

print('Number of samples:', len(input_texts))

# %%
# Unique characters on each side, sorted so that every run assigns the same
# position to each character.
input_characters = sorted(set(''.join(input_texts)))
target_characters = sorted(set(''.join(target_texts)))
num_encoder_tokens = len(input_characters)  # aka size of the english alphabet + numbers, signs, etc.
num_decoder_tokens = len(target_characters)  # aka size of the french alphabet + numbers, signs, etc.

print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
# %%
# This works very similarly to a tokenizer:
# each index maps a character to a number (its position in the sorted list).
input_token_index = dict(zip(input_characters, range(len(input_characters))))
target_token_index = dict(zip(target_characters, range(len(target_characters))))

# %%
# Demo character tokenization
print(*(input_token_index[letter] for letter in 'the cat sits on the mat'), end=' ')

# %%
# Length of the longest sentence on each side; it fixes the time axis of the
# arrays allocated below.
max_encoder_seq_length = len(max(input_texts, key=len))
max_decoder_seq_length = len(max(target_texts, key=len))

print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# %%
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

# encoder_input_data is a 3D array of shape
# (num_pairs, max_english_sentence_length, num_english_characters)
# containing a one-hot vectorization of the English sentences.
encoder_input_data = np.zeros(encoder_shape, dtype='float32')

# decoder_input_data is a 3D array of shape
# (num_pairs, max_french_sentence_length, num_french_characters)
# containing a one-hot vectorization of the French sentences.
decoder_input_data = np.zeros(decoder_shape, dtype='float32')

# decoder_target_data is the same as decoder_input_data but offset by one
# timestep: decoder_target_data[:, t, :] equals decoder_input_data[:, t + 1, :].
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

# %%
# One-hot encode the English sentences: one row per sentence, one timestep
# per character. Timesteps past the end of a sentence stay all-zero.
for row, source_text in enumerate(input_texts):
    for step, symbol in enumerate(source_text):
        encoder_input_data[row, step, input_token_index[symbol]] = 1.

# One-hot encode the French sentences twice: once as decoder input, and once
# shifted one timestep earlier as the decoder target.
for row, wrapped_target in enumerate(target_texts):
    for step, symbol in enumerate(wrapped_target):
        decoder_input_data[row, step, target_token_index[symbol]] = 1.
    # Skipping the first character drops the start marker and moves every
    # remaining character one step earlier.
    for step, symbol in enumerate(wrapped_target[1:]):
        decoder_target_data[row, step, target_token_index[symbol]] = 1.
# %%
import matplotlib.pyplot as plt

# %%
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens),
                       name='encoder_inputs')

# The return_state constructor argument configures an RNN layer to return a
# list where the first entry is the outputs and the next entries are the
# internal RNN states. This is used to recover the states of the encoder.
encoder = CuDNNLSTM(latent_dim,
                    return_state=True,
                    name='encoder')

encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens),
                       name='decoder_inputs')

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = CuDNNLSTM(latent_dim,
                         return_sequences=True,
                         return_state=True,
                         name='decoder_lstm')

# The initial_state call argument specifies the initial state(s) of an RNN.
# This is used to pass the encoder states to the decoder as initial states,
# basically making the first memory of the decoder the encoded semantics.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens,
                      activation='softmax',
                      name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# %%
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# NOTE(review): Keras takes the `validation_split` fraction from the END of
# the arrays as provided. If the data file is ordered (for example by
# sentence length) the validation set is not representative - confirm.
history = model.fit([encoder_input_data, decoder_input_data],
                    decoder_target_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)
# Save model
#model.save('s2s.h5')

# %%
# Loss curves, labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Categorical cross-entropy loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

# %%
# # Creating inference models

# %%
# Define encoder model: maps an input sequence to the encoder's final states.
encoder_model = Model(encoder_inputs, encoder_states)

# %%
# Define decoder model

# Inputs from the encoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

# Create a combined memory to input into the decoder
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder: the trained `decoder_lstm` layer is reused, now seeded with
# externally supplied states. The results get their own names so the training
# tensors (`decoder_outputs`, `state_h`, `state_c`) are not overwritten.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [inference_state_h, inference_state_c]

# Predict next char
inference_outputs = decoder_dense(inference_outputs)

# The model takes in the encoder memory plus its own memory as an input and
# spits out a prediction plus its own memory to be used for the next char.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [inference_outputs] + decoder_states)
# %%
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = {
    position: symbol for symbol, position in input_token_index.items()}
reverse_target_char_index = {
    position: symbol for symbol, position in target_token_index.items()}


# %%
def decode_sequence(input_seq):
    """Greedily decode one one-hot encoded input sentence into target text.

    The encoder model turns `input_seq` into state vectors. The decoder model
    is then called one character at a time, starting from the '\\t' start
    character, and the highest-scoring character of each step is fed back in
    as the next decoder input.

    Args:
        input_seq: One-hot array of shape (1, timesteps, num_encoder_tokens).
            The sampling loop assumes a batch of size 1.

    Returns:
        The decoded string. It ends with '\\n' when the decoder produced the
        stop character; otherwise decoding stops as soon as the text is
        longer than `max_decoder_seq_length` characters.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_chars = []
    # Loop until we receive a stop sign.
    while True:
        # Get output and internal states of the decoder.
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Pick the token with the highest score and map it back to a character.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # Exit condition: either find the stop character or hit max length.
        if sampled_char == '\n' or len(decoded_chars) > max_decoder_seq_length:
            break

        # The sampled character becomes the next (length 1) decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder states over to the next step.
        states_value = [h, c]

    return ''.join(decoded_chars)


# %%
my_text = 'Thanks!'
# Zero-pad to the length the encoder was trained on (every training input was
# padded to `max_encoder_seq_length`) instead of an arbitrary number of extra
# timesteps. A longer text keeps its own length.
padded_length = max(len(my_text), max_encoder_seq_length)
placeholder = np.zeros((1, padded_length, num_encoder_tokens))

# %%
for i, char in enumerate(my_text):
    print(i, char, input_token_index[char])
    placeholder[0, i, input_token_index[char]] = 1

# %%
decode_sequence(placeholder)