# A Network Tour of Data Science
### &nbsp; &nbsp; &nbsp; Xavier Bresson, Winter 2016/17
## Assignment 3 : Recurrent Neural Networks

In [None]:
# Import libraries
import tensorflow as tf
import numpy as np
import collections
import os

In [None]:
# Load text data
# Read the whole corpus in one go; the context manager closes the file handle
# as soon as the text has been read.
with open(os.path.join('datasets', 'text_ass_6.txt'), 'r') as text_file:  # must be simple plain text file
    data = text_file.read()
print('Text data:',data)
# Sort the vocabulary so that the character <-> index mappings are the same on
# every run (the iteration order of a set of strings is not stable across runs).
chars = sorted(set(data))
print('\nSingle characters:',chars)
data_len, vocab_size = len(data), len(chars)
print('\nText data has %d characters, %d unique.' % (data_len, vocab_size))
# Two lookup tables: character -> integer index, and integer index -> character.
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
print('\nMapping characters to numbers:',char_to_ix)
print('\nMapping numbers to characters:',ix_to_char)

# Goal 
The goal is to define with TensorFlow a vanilla recurrent neural network (RNN) model:

$$
\begin{aligned}
h_t &= \textrm{tanh}(W_h h_{t-1} + W_x x_t + b_h)\\
y_t &= W_y h_t + b_y
\end{aligned}
$$


to predict a sequence of characters. $x_t \in \mathbb{R}^D$ is the input character of the RNN in a dictionary of size $D$. $y_t \in \mathbb{R}^D$ is the predicted character (through a distribution function) by the RNN system. $h_t \in \mathbb{R}^H$ is the memory of the RNN, called the hidden state at time $t$. Its dimensionality is arbitrarily chosen to be $H$. The variables of the system are $W_h \in \mathbb{R}^{H\times H}$, $W_x \in \mathbb{R}^{H\times D}$, $W_y \in \mathbb{R}^{D\times H}$, $b_h \in \mathbb{R}^H$, and $b_y \in \mathbb{R}^D$. <br>

The number of time steps of the RNN is $T$; that is, we will learn sequences of data of length $T$: $x_t$ for $t=0,...,T-1$.

In [None]:
# hyperparameters of RNN
batch_size = 3                       # number of sequences per batch
batch_len = data_len // batch_size   # number of characters in each batch row
T = 5                                # temporal length of one training window
# One character per row is held back: the training loop reads its targets from
# the input window shifted by one position.
epoch_size = (batch_len - 1) // T    # nb of iterations to get one epoch
D = vocab_size                       # data dimension = nb of unique characters
H = 5 * D                            # size of hidden state, the memory layer

print(
    'data_len=', data_len,
    ' batch_size=', batch_size,
    ' batch_len=', batch_len,
    ' T=', T,
    ' epoch_size=', epoch_size,
    ' D=', D,
)

# Step 1 
Initialize input variables of the computational graph:<br>
(1) Xin of size *batch_size x T x D* and type *tf.float32*. Each input character is encoded on a vector of size D.<br>
(2) Ytarget of size *batch_size x T* and type *tf.int64*. Each target character is encoded by a value in {0,...,D-1}.<br>
(3) hin of size *batch_size x H* and type *tf.float32*<br>

In [None]:
# input variables of computational graph (CG)
YOUR CODE HERE

# Step 2
Define the variables of the computational graph:<br>
(1) $W_x$ is a random variable of shape *D x H* with normal distribution of variance $\frac{6}{D+H}$<br>
(2) $W_h$ is the identity matrix of shape *H x H* multiplied by the constant $0.01$<br>
(3) $W_y$ is a random variable of shape *H x D* with normal distribution of variance $\frac{6}{D+H}$<br>
(4) $b_h$, $b_y$ are zero vectors of size *H*, and *D*<br>

In [None]:
# Model variables
YOUR CODE HERE

# Step 3
Implement the recursive formula:

$$
\begin{aligned}
h_t &= \textrm{tanh}(W_h h_{t-1} + W_x x_t + b_h)\\
y_t &= W_y h_t + b_y
\end{aligned}
$$

with $h_{t=0}=hin$.<br>

Hints: <br> 
(1) You may use functions *tf.split()*, *enumerate()*, *tf.squeeze()*, *tf.matmul()*, *tf.tanh()*, *tf.transpose()*, *append()*, *pack()*.<br>
(2) You may use a matrix Y of shape *batch_size x T x D*. We recall that Ytarget should have the shape *batch_size x T*.<br>


In [None]:
# Vanilla RNN implementation
# Y starts out as an empty Python list; the code to be written below must leave
# Y as a tensor, because get_shape() is called on it afterwards.
Y = []
# Hidden state carried across the recursion, initialised with the input state hin.
ht = hin

YOUR CODE HERE

# Sanity check: print the static shapes of the outputs and of the targets.
print('Y=',Y.get_shape())
print('Ytarget=',Ytarget.get_shape())

# Step 4
Perplexity loss is implemented as:

In [None]:
# perplexity
# Number of (sequence, time step) positions in one batch.
n_positions = batch_size * T
# Flatten the outputs to one row of D scores per position.
logits = tf.reshape(Y, [n_positions, D])
# Weight of 1 for every position.
weights = tf.ones([n_positions])
# Loss for each position, computed from the scores and the target indices.
per_position_loss = tf.nn.seq2seq.sequence_loss_by_example(
    [logits],
    [Ytarget],
    [weights],
)
# Sum over all positions, then divide by the batch size.
cross_entropy_perplexity = tf.reduce_sum(per_position_loss) / batch_size
loss = cross_entropy_perplexity

# Step 5
Implement the optimization of the loss function.

Hint: You may use function *tf.train.GradientDescentOptimizer()*.


In [None]:
# Optimization
YOUR CODE HERE

# Step 6
Implement the prediction scheme: given an input character, e.g. "h", the RNN should predict the following characters, e.g. "ello". <br>

Hints: <br> 
(1) You should use the learned RNN.<br>
(2) You may use functions *tf.one_hot()*, *tf.nn.softmax()*, *tf.argmax()*.


In [None]:
# Predict
# Placeholder for the seed of the prediction (int64); presumably fed with the
# integer index of the starting character -- confirm against the training loop.
idx_pred = tf.placeholder(tf.int64) # input seed

YOUR CODE HERE

# Ypred must have been defined by the code above; it is converted to a tensor here.
Ypred = tf.convert_to_tensor(Ypred)

In [None]:
# Prepare train data matrix of size "batch_size x batch_len"
# Encode every character of the text as its integer index.
data_ix = list(map(char_to_ix.__getitem__, data[:data_len]))
train_data = np.array(data_ix)
print('original train set shape',train_data.shape)
# Keep only the first batch_size*batch_len indices (dropping any remainder) and
# arrange them row by row into a batch_size x batch_len matrix.
n_used = batch_size*batch_len
train_data = train_data[:n_used].reshape([batch_size,batch_len])
print('pre-processed train set shape',train_data.shape)

In [None]:
# The following function transforms integer values d in {0,...,D-1} into one-hot vectors, that is
# vectors of dimension D which have value 1 at index d, and 0 otherwise
from scipy.sparse import coo_matrix
def convert_to_one_hot(a,max_val=None):
    """Return the one-hot encoding of the integer labels in *a*.

    Args:
        a: Array-like of non-negative integer labels. Any shape is accepted;
            the labels are flattened in row-major order.
        max_val: Number of columns D of the encoding. When None, it is
            inferred from the data as ``max(a) + 1``.

    Returns:
        An integer ``np.ndarray`` of shape ``(a.size, max_val)`` whose row k
        holds a single 1 in column ``a.ravel()[k]`` and 0 elsewhere.

    Raises:
        ValueError: If *a* is empty and *max_val* is None, since the width
            of the encoding cannot be inferred.
    """
    labels = np.asarray(a).ravel()
    N = labels.size
    if max_val is None:
        if N == 0:
            raise ValueError('max_val must be given when the input is empty')
        max_val = int(labels.max()) + 1
    # One nonzero entry per row: (row k, column labels[k]) = 1.
    ones = np.ones(N,dtype=int)
    sparse_out = coo_matrix((ones,(np.arange(N),labels)), shape=(N,max_val))
    return sparse_out.toarray()

# Step 7
Run the computational graph with batches of training data.<br> 
Predict the sequence of characters starting from the character "h".<br> 

Hints:<br>
(1) The initial memory $h_{t=0}$ is 0.<br>
(2) Run the computational graph to optimize the perplexity loss, and to predict the sequence of characters starting from the character "h".<br>

In [None]:
# Run CG
# Build the variable-initialisation op, open a session and run the initialiser.
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)

# All-zero initial hidden state: one row of size H per sequence of the batch.
h0 = np.zeros([batch_size,H])
# Queue of the window positions that remain to be visited in the current epoch.
indices = collections.deque()
# Running sum of the loss values and number of time steps seen in the current epoch.
costs = 0.0; epoch_iters = 0
for n in range(50):
    
    # Batch extraction
    # An empty queue marks the start of a new epoch: refill it with all window
    # positions and reset the running statistics.
    if len(indices) < 1:
        indices.extend(range(epoch_size))
        costs = 0.0; epoch_iters = 0
    i = indices.popleft() 
    # Inputs: the i-th window of T character indices from every row, one-hot
    # encoded and reshaped to batch_size x T x D.
    batch_x = train_data[:,i*T:(i+1)*T]
    batch_x = convert_to_one_hot(batch_x,D); batch_x = np.reshape(batch_x,[batch_size,T,D])
    # Targets: the same window shifted by one character to the right.
    batch_y = train_data[:,i*T+1:(i+1)*T+1]
    #print(batch_x.shape,batch_y.shape)

    # Train
    # Integer index of the character 'h', printed below as the starting char.
    idx = char_to_ix['h'];
    # The run call must return three values: the loss value, a second result
    # that is discarded, and the predicted character indices.
    loss_value,_,Ypredicted = sess.run(YOUR CODE HERE)
   
    # Perplexity
    # exp of the accumulated loss divided by the accumulated number of time steps.
    costs += loss_value
    epoch_iters += T
    perplexity = np.exp(costs/epoch_iters)
    
    # n%1 is always 0, so this report is printed at every iteration.
    if not n%1:
        idx_char = Ypredicted
        # Map the predicted indices back to characters and join them into a string.
        txt = ''.join(ix_to_char[ix] for ix in list(idx_char))
        print('\nn=',n,', perplexity value=',perplexity)
        print('starting char=',ix_to_char[idx], ', predicted sequences=',txt)
    
sess.close()    