In [1]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import numpy as np
# Seed NumPy's global RNG so draws made through np.random (e.g. the
# np.random.multinomial call in `sample`) repeat from run to run.
np.random.seed(13)

Using TensorFlow backend.


In [2]:
# Fetch the text with get_file and keep only its first 50 lines so the demo
# stays small.
path = get_file('alice.txt', origin='http://www.gutenberg.org/cache/epub/11/pg11.txt')
with open(path) as text_file:  # the context manager closes the handle for us
    lines = text_file.readlines()[0:50]

# Fit the vocabulary on the raw lines, then encode every line as a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
encoded_lines = tokenizer.texts_to_sequences(lines)

# A next-word example needs at least one context word plus a target word,
# so lines that encode to fewer than two tokens are dropped.
doc = [seq for seq in encoded_lines if len(seq) > 1]

# Number of (prefix -> next word) pairs the kept lines provide in total.
words_size = sum(len(seq) - 1 for seq in doc)

In [3]:
# Longest context fed to the model: every encoded line gives up its final
# token as a target, hence the "- 1".
maxlen = max(len(seq) - 1 for seq in doc)

# One slot more than the number of known words.
# NOTE(review): presumably because word ids start at 1 and id 0 is left for
# padding -- confirm against the Tokenizer documentation.
vocab_size = 1 + len(tokenizer.word_index)

In [4]:
def generate_data(X, maxlen, V):
    """Yield one training batch per encoded sentence.

    For a sentence ``[w0, w1, ..., wn]`` the batch pairs every proper prefix
    ``[w0..w(i-1)]`` with the word ``wi`` that follows it.

    Args:
        X: iterable of sentences, each a sequence of integer word ids.
        maxlen: length passed to ``sequence.pad_sequences`` for the prefixes.
        V: number of classes passed to ``np_utils.to_categorical``.

    Yields:
        ``(inputs_sequence, y)`` -- the padded prefixes and the categorical
        encoding of their next-word targets, one tuple per sentence.
    """
    for tokens in X:
        split_points = range(1, len(tokens))
        prefixes = [tokens[0:end] for end in split_points]
        next_words = [tokens[end] for end in split_points]

        y = np_utils.to_categorical(next_words, V)
        inputs_sequence = sequence.pad_sequences(prefixes, maxlen=maxlen)
        yield (inputs_sequence, y)


In [5]:
def sample(p):
    """Draw one index at random, weighted by the scores in `p`.

    Args:
        p: 1-D array-like of non-negative weights (for example one row of a
            softmax output). They need not sum to one. `p` is not modified.

    Returns:
        The index of the drawn entry, as a NumPy integer.

    Raises:
        ValueError: if the weights do not sum to a positive, finite value.
    """
    # Normalize out of place so the caller's array is left untouched;
    # asarray also lets plain lists and integer arrays through. Doing the
    # division in float64 avoids the "sum(pvals[:-1]) > 1.0" error that
    # np.random.multinomial can raise for renormalized float32 rows.
    weights = np.asarray(p, dtype=np.float64)
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("cannot sample: weights must sum to a positive finite value")
    probs = weights / total

    # A single multinomial trial gives a one-hot row; report where the 1 is.
    draw = np.random.multinomial(1, probs, 1)
    return np.where(draw == 1)[1][0]

In [6]:
# Next-word model: embed each token id of the padded prefix, run the sequence
# through an LSTM that returns only its final output, and map that to a
# softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 128, input_length=maxlen),
    LSTM(128, return_sequences=False),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adadelta', loss='categorical_crossentropy')

In [7]:
# Invert word -> id once so each generated id is resolved with a dict lookup
# instead of a scan over the whole vocabulary.
# NOTE(review): assumes every word has a distinct id -- confirm.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for i in range(30):
    # One pass over the corpus: every encoded line contributes one batch.
    for x, y in generate_data(doc, maxlen, vocab_size):
        model.train_on_batch(x, y)

    # After each pass, sample a continuation of the seed text to eyeball progress.
    in_words = "alice's"
    for _ in range(maxlen):
        in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
        wordid = sample(model.predict(in_sequence, verbose=0)[0])
        # An id with no vocabulary entry appends nothing, matching the
        # behaviour of the scan it replaces.
        word = index_to_word.get(wordid)
        if word is not None:
            in_words += ' ' + word

    print(i, in_words)

0 alice's for 25 3 her hot bank author what 'without pleasure feel posting at thought
1 alice's mind ' into no conversations and pictures sitting as away the re in copy
2 alice's anyone get and hole once hot it 25 language was worth rabbit chapter
3 alice's mind carroll a ' anywhere terms date thought org up restrictions was license carroll
4 alice's millennium well carroll gutenberg peeped 25 project feel 11 edition up online copy i
5 alice's updated project use restrictions the www carroll she edition project you stupid stupid feel
6 alice's 3 carroll in very english ' ' it with her fulcrum well updated is
7 alice's hole 0 making edition and well it and very adventures had sister in own
8 alice's in pleasure this was her the anywhere daisy updated restrictions whatsoever 11 0 release
9 alice's gutenberg no carroll ' or alice's ' millennium carroll she had ebook carroll june
10 alice's ebook december was use is june had is wonderland 2008 beginning 20 wonderland do
11 alice's 0 ebook 

In [8]:
# Invert word -> id once so each predicted id is resolved with a dict lookup
# instead of a scan over the whole vocabulary.
# NOTE(review): assumes every word has a distinct id -- confirm.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

# Greedy decoding: always append the most probable next word.
in_words = "alice's"
for _ in range(maxlen):
    in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
    # argmax over the softmax row is what predict_classes computed for a
    # multi-class output; spelling it out also works on Keras versions that
    # no longer provide predict_classes.
    wordid = np.argmax(model.predict(in_sequence, verbose=0), axis=-1)[0]
    # An id with no vocabulary entry appends nothing, matching the
    # behaviour of the scan it replaces.
    word = index_to_word.get(wordid)
    if word is not None:
        in_words += ' ' + word

print(in_words)

alice's carroll carroll carroll carroll carroll in in the the the the the the or
