In [1]:
# after https://github.com/hans/ipython-notebooks/blob/master/tf/TF%20tutorial.ipynb

In [2]:
import numpy as np
import tensorflow as tf

%matplotlib inline
import matplotlib.pyplot as plt

import tempfile
logdir = tempfile.mkdtemp()
print(logdir)

/tmp/tmpmn29ne8n


In [3]:
sess = tf.InteractiveSession()

In [4]:
seq_length = 5 # number of timesteps
batch_size = 64 
vocab_size = 7
embedding_size = 50
state_size = 100

In [5]:
# tensors are input as a list of size (number of timesteps)
enc_inp = [tf.placeholder(tf.int32, shape=(None,), name="inp%i" % t) for t in range(seq_length)]
labels = [tf.placeholder(tf.int32, shape=(None,), name="labels%i" % t) for t in range(seq_length)]
enc_inp

[,
 ,
 ,
 ,
 ]

In [6]:
weights = [tf.ones_like(labels_t, dtype=tf.float32) for labels_t in labels]
weights

[,
 ,
 ,
 ,
 ]

In [7]:
# Decoder input: prepend some "GO" token and drop the final token of the encoder input
dec_inp = ([tf.zeros_like(enc_inp[0], dtype=np.int32, name="GO")] + enc_inp[:-1])
dec_inp

[,
 ,
 ,
 ,
 ]

In [8]:
# Initial memory value for recurrence.
prev_mem = tf.zeros((batch_size, state_size))
prev_mem



We can use different kinds of RNN cell for seq2seq.

In [30]:
cell = tf.nn.rnn_cell.GRUCell(state_size)


tf.nn.seq2seq.embedding_rnn_seq2seq:

> Embedding RNN sequence-to-sequence model.
 This model first embeds encoder_inputs by a newly created embedding (of shape
 [num_encoder_symbols x input_size]). Then it runs an RNN to encode
 embedded encoder_inputs into a state vector. Next, it embeds decoder_inputs
 by another newly created embedding (of shape [num_decoder_symbols x
 input_size]). Then it runs RNN decoder, initialized with the last
 encoder state, on embedded decoder_inputs.

In [10]:
# inputs will be embedded, so have to specify maximum number of symbols that can appear (vocab_size)
dec_outputs, dec_state = tf.nn.seq2seq.embedding_rnn_seq2seq(
 enc_inp, dec_inp, cell, vocab_size, vocab_size, embedding_size)
dec_outputs, dec_state

([,
 ,
 ,
 ,
 ],
 )

In [11]:
loss = tf.nn.seq2seq.sequence_loss(dec_outputs, labels, weights, vocab_size)

In [12]:
tf.scalar_summary("loss", loss)



In [13]:
magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_state[1])))
tf.scalar_summary("magnitude at t=1", magnitude)



In [14]:
summary_op = tf.merge_all_summaries()

In [15]:
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)

In [16]:
logdir = tempfile.mkdtemp()
print(logdir)
summary_writer = tf.train.SummaryWriter(logdir, sess.graph)

/tmp/tmpo_41xu3j


In [17]:
sess.run(tf.initialize_all_variables())

### Train network, step-by-step

Generate input:

In [18]:
X = [np.random.choice(vocab_size, size=(seq_length,), replace=False)
 for _ in range(batch_size)]
Y = X[:]
X[:5]

[array([3, 4, 2, 6, 5]),
 array([5, 3, 4, 6, 1]),
 array([4, 2, 6, 5, 1]),
 array([2, 4, 5, 6, 1]),
 array([5, 4, 1, 2, 3])]

In [19]:
X = np.array(X).T
Y = np.array(Y).T
X[:5]

array([[3, 5, 4, 2, 5, 3, 1, 2, 3, 3, 0, 6, 4, 0, 5, 1, 4, 4, 1, 1, 0, 3,
 4, 3, 6, 4, 4, 0, 0, 2, 6, 5, 3, 0, 5, 2, 4, 5, 1, 3, 5, 6, 2, 0,
 2, 3, 0, 6, 2, 6, 1, 1, 4, 5, 4, 0, 0, 2, 3, 0, 5, 2, 1, 0],
 [4, 3, 2, 4, 4, 1, 2, 3, 0, 1, 1, 4, 5, 2, 0, 5, 0, 2, 4, 6, 4, 5,
 6, 1, 2, 2, 3, 1, 5, 5, 2, 2, 5, 3, 6, 1, 5, 3, 5, 1, 0, 1, 1, 5,
 5, 0, 5, 2, 5, 4, 5, 6, 2, 6, 1, 2, 2, 4, 0, 2, 6, 1, 3, 3],
 [2, 4, 6, 5, 1, 0, 3, 1, 1, 2, 4, 0, 1, 4, 1, 6, 1, 3, 2, 5, 1, 4,
 3, 2, 1, 3, 2, 3, 6, 1, 1, 0, 4, 4, 2, 6, 1, 4, 0, 2, 1, 0, 3, 4,
 3, 6, 4, 4, 6, 5, 4, 5, 1, 4, 3, 5, 5, 0, 6, 1, 3, 5, 4, 4],
 [6, 6, 5, 6, 2, 6, 5, 5, 6, 5, 5, 2, 6, 6, 4, 4, 3, 1, 6, 2, 5, 6,
 0, 5, 0, 6, 0, 2, 4, 3, 3, 6, 2, 2, 3, 3, 3, 1, 6, 5, 4, 2, 4, 3,
 0, 2, 2, 5, 3, 0, 0, 4, 6, 3, 2, 6, 3, 5, 2, 5, 1, 3, 2, 1],
 [5, 1, 1, 1, 3, 4, 6, 4, 5, 4, 6, 1, 0, 3, 2, 0, 6, 6, 0, 4, 2, 2,
 2, 0, 3, 0, 6, 4, 1, 0, 4, 4, 1, 5, 1, 0, 0, 0, 4, 4, 2, 3, 0, 1,
 1, 4, 6, 1, 0, 2, 3, 3, 5, 1, 5, 3, 1, 1, 1, 4, 4, 6, 6, 5]])

In [20]:
[X[t] for t in range(seq_length)]

[array([3, 5, 4, 2, 5, 3, 1, 2, 3, 3, 0, 6, 4, 0, 5, 1, 4, 4, 1, 1, 0, 3, 4,
 3, 6, 4, 4, 0, 0, 2, 6, 5, 3, 0, 5, 2, 4, 5, 1, 3, 5, 6, 2, 0, 2, 3,
 0, 6, 2, 6, 1, 1, 4, 5, 4, 0, 0, 2, 3, 0, 5, 2, 1, 0]),
 array([4, 3, 2, 4, 4, 1, 2, 3, 0, 1, 1, 4, 5, 2, 0, 5, 0, 2, 4, 6, 4, 5, 6,
 1, 2, 2, 3, 1, 5, 5, 2, 2, 5, 3, 6, 1, 5, 3, 5, 1, 0, 1, 1, 5, 5, 0,
 5, 2, 5, 4, 5, 6, 2, 6, 1, 2, 2, 4, 0, 2, 6, 1, 3, 3]),
 array([2, 4, 6, 5, 1, 0, 3, 1, 1, 2, 4, 0, 1, 4, 1, 6, 1, 3, 2, 5, 1, 4, 3,
 2, 1, 3, 2, 3, 6, 1, 1, 0, 4, 4, 2, 6, 1, 4, 0, 2, 1, 0, 3, 4, 3, 6,
 4, 4, 6, 5, 4, 5, 1, 4, 3, 5, 5, 0, 6, 1, 3, 5, 4, 4]),
 array([6, 6, 5, 6, 2, 6, 5, 5, 6, 5, 5, 2, 6, 6, 4, 4, 3, 1, 6, 2, 5, 6, 0,
 5, 0, 6, 0, 2, 4, 3, 3, 6, 2, 2, 3, 3, 3, 1, 6, 5, 4, 2, 4, 3, 0, 2,
 2, 5, 3, 0, 0, 4, 6, 3, 2, 6, 3, 5, 2, 5, 1, 3, 2, 1]),
 array([5, 1, 1, 1, 3, 4, 6, 4, 5, 4, 6, 1, 0, 3, 2, 0, 6, 6, 0, 4, 2, 2, 2,
 0, 3, 0, 6, 4, 1, 0, 4, 4, 1, 5, 1, 0, 0, 0, 4, 4, 2, 3, 0, 1, 1, 4,
 6, 1, 0, 2, 3, 3, 5, 1, 5, 3, 1, 1, 

Feed input:

In [21]:
feed_dict = {enc_inp[t]: X[t] for t in range(seq_length)}

In [22]:
feed_dict.update({labels[t]: Y[t] for t in range(seq_length)})

One training step:

In [23]:
_, loss_t, summary = sess.run([train_op, loss, summary_op], feed_dict)
loss_t, summary

(1.9546013,
 b'\n\x0b\n\x04loss\x15`0\xfa?\n\x17\n\x10magnitude at t=1\x15/\xc5\x8a?')

### Test case

In [24]:
X_batch = [np.random.choice(vocab_size, size=(seq_length,), replace=False)
 for _ in range(10)]
X_batch

[array([5, 0, 4, 2, 1]),
 array([1, 2, 5, 3, 0]),
 array([3, 0, 5, 1, 4]),
 array([6, 3, 5, 2, 1]),
 array([6, 4, 1, 5, 2]),
 array([1, 5, 3, 0, 6]),
 array([4, 6, 0, 2, 3]),
 array([6, 4, 0, 5, 1]),
 array([0, 2, 1, 6, 4]),
 array([4, 3, 1, 5, 0])]

In [25]:
X_batch = np.array(X_batch).T
X_batch

array([[5, 1, 3, 6, 6, 1, 4, 6, 0, 4],
 [0, 2, 0, 3, 4, 5, 6, 4, 2, 3],
 [4, 5, 5, 5, 1, 3, 0, 0, 1, 1],
 [2, 3, 1, 2, 5, 0, 2, 5, 6, 5],
 [1, 0, 4, 1, 2, 6, 3, 1, 4, 0]])

In [26]:
feed_dict = {enc_inp[t]: X_batch[t] for t in range(seq_length)}
dec_outputs_batch = sess.run(dec_outputs, feed_dict)
dec_outputs_batch

[array([[ 0.17844655, 0.11353172, -0.18144946, 0.01085662, 0.2627669 ,
 -0.19861746, -0.08941379],
 [ 0.21098135, -0.08187042, 0.01305156, -0.05247057, 0.07250485,
 -0.23508257, -0.2811504 ],
 [ 0.13883567, 0.09521072, -0.11679886, 0.00680578, 0.16948652,
 -0.12717277, -0.0371674 ],
 [ 0.21993272, 0.00940304, -0.16575341, -0.16260277, 0.1767858 ,
 -0.18685289, -0.08371995],
 [ 0.10060885, 0.01945375, -0.21006839, -0.14021412, -0.01953002,
 -0.18472196, -0.08986312],
 [ 0.17035802, -0.07102758, 0.0302645 , -0.25397092, -0.00265401,
 -0.12531452, -0.2037717 ],
 [ 0.24822755, -0.21595982, 0.00301015, -0.27651039, 0.10579222,
 -0.24226177, -0.21743613],
 [ 0.27941915, 0.13427791, -0.13733003, -0.05824878, 0.17526907,
 -0.10152674, -0.1523411 ],
 [ 0.04663948, -0.0393581 , -0.08266062, -0.11853192, 0.05972272,
 -0.11571028, -0.0639744 ],
 [ 0.18015689, -0.02721097, -0.00587304, -0.02247566, 0.12168466,
 -0.14741473, -0.15037411]], dtype=float32),
 array([[ 0.10395906, 0.18637255, -0.1152041

In [27]:
[logits_t.argmax(axis=1) for logits_t in dec_outputs_batch]

[array([4, 0, 4, 0, 0, 0, 0, 0, 4, 0]),
 array([1, 0, 0, 4, 0, 0, 4, 0, 4, 4]),
 array([4, 1, 4, 4, 4, 0, 4, 4, 2, 0]),
 array([4, 1, 1, 1, 6, 1, 4, 4, 2, 0]),
 array([1, 1, 5, 1, 6, 3, 2, 0, 2, 6])]

### training function

In [28]:
def train_batch(batch_size):
 X = [np.random.choice(vocab_size, size=(seq_length,), replace=False) for _ in range(batch_size)]
 Y = X[:]
 
 # Dimshuffle to seq_len * batch_size
 X = np.array(X).T
 Y = np.array(Y).T

 feed_dict = {enc_inp[t]: X[t] for t in range(seq_length)}
 feed_dict.update({labels[t]: Y[t] for t in range(seq_length)})

 _, loss_t, summary = sess.run([train_op, loss, summary_op], feed_dict)
 return loss_t, summary

In [29]:
for t in range(500):
 loss_t, summary = train_batch(batch_size)
 summary_writer.add_summary(summary, t)
summary_writer.flush()