"Open

In [0]:
import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe


tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
LABEL = data.LabelField()
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train_data)
word_embeddings = TEXT.vocab.vectors
print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
print ("Label Length: " + str(len(LABEL.vocab)))
train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data
train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
'''Alternatively we can also use the default configurations'''
#train_iter_, test_iter_ = datasets.IMDB.iters(batch_size=32)
vocab_size = len(TEXT.vocab)
#return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

Length of Text Vocabulary: 251639
Vector size of Text Vocabulary: torch.Size([251639, 300])
Label Length: 2


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Optimization project")
os.getcwd()

file_path = "/content/drive/My Drive/Colab Notebooks/Optimization project/IMDB"
#directory = os.path.dirname(file_path)

try:
 os.stat(file_path)
except:
 os.mkdir(file_path) 

In [0]:
import sug
from sug import SUG

In [0]:
import torch
from torch.optim import Optimizer
import math
import copy

class SUG(Optimizer):
 def __init__(self, params, l_0, d_0=0, prob=1., eps=1e-4, momentum=0, dampening=0,
 weight_decay=0, nesterov=False):
 if l_0 < 0.0:
 raise ValueError("Invalid Lipsitz constant of gradient: {}".format(l_0))
 if d_0 < 0.0:
 raise ValueError("Invalid disperion of gradient: {}".format(d_0))
 if momentum < 0.0:
 raise ValueError("Invalid momentum value: {}".format(momentum))
 if weight_decay < 0.0:
 raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

 defaults = dict(L=l_0, momentum=momentum, dampening=dampening,
 weight_decay=weight_decay, nesterov=nesterov)
 if nesterov and (momentum <= 0 or dampening != 0):
 raise ValueError("Nesterov momentum requires a momentum and zero dampening")
 self.Lips = l_0
 self.prev_Lips = l_0
 self.D_0 = d_0
 self.eps = eps
 self.prob = prob
 self.start_param = params
 self.upd_sq_grad_norm = None
 self.sq_grad_norm = None
 self.loss = torch.tensor(0.)
 self.cur_loss = 0
 self.closure = None
 super(SUG, self).__init__(params, defaults)

 def __setstate__(self, state):
 super(SUG, self).__setstate__(state)
 for group in self.param_groups:
 group.setdefault('nesterov', False)

 def comp_batch_size(self):
 """Returns optimal batch size for given d_0, eps and l_0;

 """
 return math.ceil(2 * self.D_0 * self.eps / self.prev_Lips)

 def step(self, loss, closure):
 """Performs a single optimization step.

 Arguments:
 loss : current loss

 closure (callable, optional): A closure that reevaluates the model
 and returns the loss.
 """
 self.start_params = []
 self.loss = loss
 self.sq_grad_norm = 0
 self.cur_loss = loss
 self.closure = closure
 for gr_idx, group in enumerate(self.param_groups):
 weight_decay = group['weight_decay']
 momentum = group['momentum']
 dampening = group['dampening']
 nesterov = group['nesterov']
 self.start_params.append([])
 for p_idx, p in enumerate(group['params']):
 self.start_params[gr_idx].append([p.data.clone()])
 if p.grad is None:
 continue
 self.start_params[gr_idx][p_idx].append(p.grad.data.clone())
 d_p = self.start_params[gr_idx][p_idx][1]
 p_ = self.start_params[gr_idx][p_idx][0]
 
 
 if weight_decay != 0:
 d_p.add_(weight_decay, p.data)
 self.cur_loss += weight_decay * torch.sum(p * p).item()
 
 
 self.sq_grad_norm += torch.sum(d_p * d_p).item()
 
 if momentum != 0:
 param_state = self.state[p]
 if 'momentum_buffer' not in param_state:
 buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
 buf.mul_(momentum).add_(d_p)
 else:
 buf = param_state['momentum_buffer']
 buf.mul_(momentum).add_(1 - dampening, d_p)
 if nesterov:
 d_p = d_p.add(momentum, buf)
 else:
 d_p = buf
 self.start_params[gr_idx][p_idx][1] = d_p
 
 i = 0
 self.Lips = max(self.prev_Lips / 2, 0.1)
 difference = -1
 while difference < 0 or i == 0:
 if (i > 0): 
 self.Lips = max(self.Lips * 2, 0.1)
 for gr_idx, group in enumerate(self.param_groups):
 for p_idx, p in enumerate(group['params']):
 if p.grad is None:
 continue
 start_param_val = self.start_params[gr_idx][p_idx][0]
 start_param_grad = self.start_params[gr_idx][p_idx][1]
 p.data = start_param_val - 1/(2*self.Lips) * start_param_grad
 difference, upd_loss = self.stop_criteria()
 i += 1
 self.prev_Lips = self.Lips

 return self.Lips, i

 def stop_criteria(self):
 """Checks if the Lipsitz constant of gradient is appropriate
 
 + 2L_k / 2 ||x_k - w_k||^2 = - 1 / (2L_k)||g(x_k)||^2 + 1 / (4L_k)||g(x_k)||^2 = -1 / (4L_k)||g(x_k)||^2 
 """
 upd_loss = self.closure()
 major = self.cur_loss - 1 / (4 * self.Lips) * self.sq_grad_norm
 return major - upd_loss - self.l2_reg() + self.eps / 10, upd_loss

 def get_lipsitz_const(self):
 """Returns current Lipsitz constant of the gradient of the loss function
 """
 return self.Lips
 
 def get_sq_grad(self):
 """Returns the current second norm of the gradient of the loss function 
 calculated by the formula
 
 ||f'(p_1,...,p_n)||_2^2 ~ \sum\limits_{i=1}^n ((df/dp_i) * (df/dp_i))(p1,...,p_n))
 
 """
 self.upd_sq_grad_norm = 0
 for gr_idx, group in enumerate(self.param_groups):
 for p_idx, p in enumerate(group['params']):
 if p.grad is None:
 continue
 self.upd_sq_grad_norm += torch.sum(p.grad.data * p.grad.data).item()
 
 return self.upd_sq_grad_norm
 
 def l2_reg(self):
 """Returns the current l2 regularization addiction
 
 """
 self.upd_l2_reg = 0
 for gr_idx, group in enumerate(self.param_groups):
 weight_decay = group['weight_decay']
 if weight_decay != 0:
 for p_idx, p in enumerate(group['params']):
 self.upd_l2_reg += weight_decay * torch.sum(p * p).item()
 
 return self.upd_l2_reg

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device

device(type='cuda', index=0)

## Model

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [0]:
class SimpleLSTMBaseline(nn.Module):
 def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
 super().__init__() 
 self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
 self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
 
 self.linear1 = nn.Linear(2 * hidden_dim, 32)
 self.linear1.weight.data.fill_(2)
 self.linear2 = nn.Linear(32, 2)
 self.linear2.weight.data.fill_(2)

 
 def forward(self, seq, lens):
 embeds = self.embedding(seq)
 packed = pack_padded_sequence(embeds, lens, batch_first=True)
 hdn, _ = self.encoder(packed)
 hdn, _ = pad_packed_sequence(hdn, batch_first=True)
 output = nn.functional.max_pool1d(hdn, kernel_size=10)
 output = nn.functional.relu(self.linear1(hdn[:,1,:]))
 prob = nn.functional.log_softmax(self.linear2(output), -1)
 
 return prob
 

In [0]:
class LSTMClassifier(nn.Module):
 def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
 super(LSTMClassifier, self).__init__()
 
 """
 Arguments
 ---------
 batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
 output_size : 2 = (pos, neg)
 hidden_sie : Size of the hidden_state of the LSTM
 vocab_size : Size of the vocabulary containing unique words
 embedding_length : Embeddding dimension of GloVe word embeddings
 weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
 
 """
 
 self.batch_size = batch_size
 self.output_size = output_size
 self.hidden_size = hidden_size
 self.vocab_size = vocab_size
 self.embedding_length = embedding_length
 self.num_layers = 1
 
 self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
 self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.
 self.lstm = nn.LSTM(embedding_length, hidden_size, batch_first=True, bidirectional=False, num_layers=self.num_layers)
 self.label = nn.Linear(1 * hidden_size * self.num_layers, output_size)
 
 def forward(self, input_sentence, batch_size=None):

 """ 
 Parameters
 ----------
 input_sentence: input_sentence of shape = (batch_size, num_sequences)
 batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)
 
 Returns
 -------
 Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM
 final_output.shape = (batch_size, output_size)
 
 """
 
 ''' Here we will map all the indexes present in the input sequence to the corresponding word vector using our pre-trained word_embedddins.'''
 input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length)
 #input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)
 batch_size = input_sentence.size(0)
 h_0 = Variable(torch.zeros(1 * self.num_layers, batch_size, self.hidden_size).cuda())
 c_0 = Variable(torch.zeros(1 * self.num_layers, batch_size, self.hidden_size).cuda())
 #packed = pack_padded_sequence(input, lens, batch_first=True)
 #output, (final_hidden_state, final_cell_state) = self.lstm(packed, (h_0, c_0))
 #output, _ = pad_packed_sequence(output, batch_first=True)
 #print(input.size())
 output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))
 final_output = self.label(final_hidden_state.view(batch_size, self.num_layers*1*self.hidden_size)) # final_hidden_state.size() = (2, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)
 
 return final_output

In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

class AttentionModel(torch.nn.Module):
	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		super(AttentionModel, self).__init__()
		
		"""
		Arguments
		---------
		batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
		output_size : 2 = (pos, neg)
		hidden_sie : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embeddding dimension of GloVe word embeddings
		weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
		
		--------
		
		"""
		
		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length
		
		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)
		self.lstm = nn.LSTM(embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)
		#self.attn_fc_layer = nn.Linear()
		
	def attention_net(self, lstm_output, final_state):

		""" 
		Now we will incorporate Attention mechanism in our LSTM model. In this new model, we will use attention to compute soft alignment score corresponding
		between each of the hidden_state and the last hidden_state of the LSTM. We will be using torch.bmm for the batch matrix multiplication.
		
		Arguments
		---------
		
		lstm_output : Final output of the LSTM which contains hidden layer outputs for each sequence.
		final_state : Final time-step hidden state (h_n) of the LSTM
		
		---------
		
		Returns : It performs attention mechanism by first computing weights for each of the sequence present in lstm_output and and then finally computing the
				 new hidden state.
				 
		Tensor Size :
					hidden.size() = (batch_size, hidden_size)
					attn_weights.size() = (batch_size, num_seq)
					soft_attn_weights.size() = (batch_size, num_seq)
					new_hidden_state.size() = (batch_size, hidden_size)
					 
		"""
		
		hidden = final_state.squeeze(0)
		attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
		soft_attn_weights = F.softmax(attn_weights, 1)
		new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
		
		return new_hidden_state
	
	def forward(self, input_sentences, batch_size=None):
	
		""" 
		Parameters
		----------
		input_sentence: input_sentence of shape = (batch_size, num_sequences)
		batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)
		
		Returns
		-------
		Output of the linear layer containing logits for pos & neg class which receives its input as the new_hidden_state which is basically the output of the Attention network.
		final_output.shape = (batch_size, output_size)
		
		"""
		batch_size = input_sentences.size(0)
		input = self.word_embeddings(input_sentences)
		input = input.permute(1, 0, 2)
		h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
		c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
		output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size) 
		output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size)
		
		attn_output = self.attention_net(output, final_hidden_state)
		logits = self.label(attn_output)
		
		return nn.functional.log_softmax(logits, -1)

## Training

In [0]:
import time
import math

def time_since(since):
 s = time.time() - since
 m = math.floor(s / 60)
 s -= m * 60
 return '%dm %ds' % (m, s)


def model_step(model, optimizer, criterion, inputs, labels):
 outputs = model(inputs)
 loss = criterion(outputs, labels)
 acc = (torch.argmax(outputs, 1) == labels).float().sum().item()
 if model.training:
 optimizer.zero_grad()
 loss.backward(retain_graph=True)
 if optimizer.__class__.__name__ != 'SUG':
 optimizer.step()
 else:
 def closure():
 optimizer.zero_grad()
 upd_outputs = model(inputs)
 upd_loss = criterion(upd_outputs, labels).item()

 return upd_loss

 optimizer.step(loss.item(), closure)

 return loss.item(), acc

In [0]:
def train(model, trainloader, criterion, optimizer, path=None, n_epochs=2, validloader=None, eps=1e-5, print_every=1):
 tr_loss, val_loss, lips, times, grad, tr_acc, val_acc = ([] for i in range(7))
 start_time = time.time()
 model.to(device=device)
 print(len(list(trainloader)))
 for ep in range(n_epochs):
 model.train()
 i = 0
 tot_acc = 0
 n_ex = 0
 for i, batch in enumerate(trainloader):
 #t, l = batch
 #(text, lens), target = t
 text = batch.text[0]
 lens = batch.text[1]
 target = batch.label
 target = torch.autograd.Variable(target).long()
 if torch.cuda.is_available():
 text = text.cuda()
 target = target.cuda()
 loss, acc = model_step(model, optimizer, criterion, text, target)
 tot_acc += acc
 n_ex += text.size(0)
 tr_loss.append(loss) 
 
 if optimizer.__class__.__name__ == 'SUG':
 lips.append(optimizer.get_lipsitz_const())
 grad.append(optimizer.get_sq_grad)
 if i % 100 == 0:
 print(tr_loss[-1], i)
 times.append(time_since(start_time))
 model.zero_grad()
 optimizer.zero_grad()
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss,
 'val_loss' : val_loss,
 'lips' : lips,
 'grad' : grad,
 'times' : times
 } 
 if path is not None:
 torch.save(states, path)
 tr_acc.append(tot_acc / n_ex)
 times.append(time_since(start_time))
 if ep % print_every == 0:
 print("Epoch {}, training loss {}, time passed {}, training accuracy {}".format(ep, sum(tr_loss[-i:]) / i, time_since(start_time), tr_acc[-1]))

 if validloader is None:
 continue
 model.zero_grad()
 model.eval()
 j = 0
 count = 0
 n_ex = 0
 for j, batch in enumerate(validloader):
 text = batch.text[0]
 target = batch.label
 target = torch.autograd.Variable(target).long()
 if torch.cuda.is_available():
 text = text.cuda()
 target = target.cuda()
 outputs = model(text)
 #outputs_lab = torch.argmax(outputs, 1)
 count += (torch.argmax(outputs, 1) == target).float().sum().item()
 n_ex += outputs.size(0) 
 val_loss.append(criterion(outputs, target).item())
 val_acc.append(count / n_ex)
 if ep % print_every == 0:
 print("Validation loss {}, validation accuracy {}".format(sum(val_loss[-j:]) / j, val_acc[-1]))
 
 return tr_loss, times, val_loss, lips, grad, tr_acc, val_acc

In [0]:
def concat_states(state1, state2):
 states = {
 'epoch': state1['epoch'] + state2['epoch'],
 'state_dict': state2['state_dict'],
 'optimizer': state2['optimizer'],
 'tr_loss' : state1['tr_loss'] + state2['tr_loss'],
 'val_loss' : state1['val_loss'] + state2['val_loss'],
 'lips' : state1['lips'] + state2['lips'],
 'grad' : state1['grad'] + state2['grad'],
 #'times' : state1['times'] + list(map(lambda x: x + state1['times'][-1],state2['times']))
 'times' : state1['times'] + state2['times']
 }
 return states

In [0]:
print_every = 1
n_epochs = 10
tr_loss = {}
tr_loss['sgd'] = {}
val_loss = {}
val_loss['sgd'] = {}
#lrs = [0.05, 0.01, 0.005]
em_sz = 128
hidden_size = 256
embedding_length = 300
nl = 2
torch.manual_seed(999)
batch_size = 32
criterion = nn.CrossEntropyLoss()

In [0]:
n_epochs = 20

In [0]:
vocab_size = int(vocab_size)
vocab_size

251639

In [0]:
lrs = [0.0001, 0.001]
for lr in lrs:
 model = LSTMClassifier(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/LSTM_' + str(lr))

SGD lr=0.0001, momentum=0. :
547
0.6964250206947327 0
0.694038987159729 100
0.690762996673584 200
0.6946480870246887 300
0.6906776428222656 400
0.6938506364822388 500
Epoch 0, training loss 0.6931881379513514, time passed 0m 17s, training accuracy 0.5025714285714286
Validation loss 0.6940929013439733, validation accuracy 0.4856
0.6945292353630066 0
0.69283527135849 100
0.6948346495628357 200
0.6958023905754089 300
0.688003420829773 400
0.6869951486587524 500
Epoch 1, training loss 0.6931847904846345, time passed 0m 26s, training accuracy 0.5028
Validation loss 0.694080766194906, validation accuracy 0.48546666666666666
0.6985771656036377 0
0.6874481439590454 100
0.6930689215660095 200
0.6923878192901611 300
0.693231463432312 400
0.693196713924408 500
Epoch 2, training loss 0.6931726265521276, time passed 0m 36s, training accuracy 0.5028571428571429
Validation loss 0.6940687321699582, validation accuracy 0.486
0.6972582340240479 0
0.690631628036499 100
0.6858733892440796 200
0.6978160142

In [0]:
l_0 = 20
model = LSTMClassifier(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
print("SUG l_0={}, momentum=0. :".format(l_0))
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.)
tr_loss['sug'], times, val_loss['sug'], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sug'],
 'val_loss' : val_loss['sug'],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
torch.save(states, './IMDB/LSTM_sug')

SUG l_0=20, momentum=0. :
547
0.6849241256713867 0


 self.dropout, self.training, self.bidirectional, self.batch_first)


0.6952140927314758 100
0.7481666803359985 200
0.6844102144241333 300
0.6859437227249146 400
0.7077557444572449 500
Epoch 0, training loss 0.7053052665113093, time passed 0m 20s, training accuracy 0.5095428571428572
Validation loss 0.7462471091849172, validation accuracy 0.5014666666666666
0.7905590534210205 0
0.6987895965576172 100
0.7223014831542969 200
0.7111559510231018 300
0.742397665977478 400
0.6616271138191223 500
Epoch 1, training loss 0.7009164982444638, time passed 0m 38s, training accuracy 0.5201714285714286
Validation loss 0.6874387786429152, validation accuracy 0.5314666666666666
0.6852426528930664 0
0.7327583432197571 100
0.6913496851921082 200
0.6588264107704163 300
0.5595537424087524 400
0.5827333927154541 500
Epoch 2, training loss 0.6299851954856635, time passed 0m 56s, training accuracy 0.6525714285714286
Validation loss 0.5988702120689245, validation accuracy 0.7065333333333333
0.5051348209381104 0
0.6458197236061096 100
0.6327732801437378 200
0.6482389569282532 300

In [0]:
lrs = [0.01]
for lr in lrs:
 model = LSTMClassifier(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/LSTM_' + str(lr))

SGD lr=0.01, momentum=0. :
547
0.6858300566673279 0
0.6929423213005066 100
0.6985231041908264 200
0.7083036303520203 300
0.7047500014305115 400
0.7050995826721191 500
Epoch 0, training loss 0.6928785365798098, time passed 0m 9s, training accuracy 0.5070285714285714
Validation loss 0.6905961920562972, validation accuracy 0.5312
0.688129186630249 0
0.6874046921730042 100
0.691518247127533 200
0.6922094225883484 300
0.7087409496307373 400
0.6636379361152649 500
Epoch 1, training loss 0.691004237521699, time passed 0m 20s, training accuracy 0.5172
Validation loss 0.6904787467076228, validation accuracy 0.5094666666666666
0.6732712984085083 0
0.6795019507408142 100
0.7093783617019653 200
0.6702148914337158 300
0.6914629340171814 400
0.679591715335846 500
Epoch 2, training loss 0.6890987443836617, time passed 0m 30s, training accuracy 0.5206285714285714
Validation loss 0.6867884596188863, validation accuracy 0.5437333333333333
0.67351233959198 0
0.7418563961982727 100
0.6954500675201416 200


In [0]:
lrs = [0.001]
for lr in lrs:
 model = LSTMClassifier(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.Adam(model.parameters(), lr=lr)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/LSTM_adam_' + str(lr))

SGD lr=0.001, momentum=0. :
547
0.7042360901832581 0
0.7139124274253845 100
0.648885190486908 200
0.6860226392745972 300
0.685365617275238 400
0.6789942383766174 500
Epoch 0, training loss 0.6891226628761151, time passed 0m 10s, training accuracy 0.5336
Validation loss 0.6916678978337182, validation accuracy 0.5101333333333333
0.684501051902771 0
0.6671040058135986 100
0.6860296130180359 200
0.6729943156242371 300
0.6892271041870117 400
0.7105855941772461 500
Epoch 1, training loss 0.6802897323946376, time passed 0m 22s, training accuracy 0.5594857142857143
Validation loss 0.6862392445914766, validation accuracy 0.5221333333333333
0.7073076963424683 0
0.689601480960846 100
0.5204960703849792 200
0.688227653503418 300
0.6900652647018433 400
0.7155442833900452 500
Epoch 2, training loss 0.6715143832204106, time passed 0m 33s, training accuracy 0.5706285714285714
Validation loss 0.6796312681120685, validation accuracy 0.532
0.6642115116119385 0
0.6051458120346069 100
0.4899091422557831 20

In [0]:
lrs = [0.0001, 0.001]
for lr in lrs:
 model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/attn_' + str(lr))

SGD lr=0.0001, momentum=0. :
547
0.6913896799087524 0
0.6833416819572449 100
0.6810654401779175 200
0.6995826363563538 300
0.6945667862892151 400
0.6890664100646973 500
Epoch 0, training loss 0.6941597445325537, time passed 0m 16s, training accuracy 0.5016571428571429
Validation loss 0.6946967735759213, validation accuracy 0.49093333333333333
0.703650951385498 0
0.6967843174934387 100
0.6908212304115295 200
0.6907294392585754 300
0.6946154832839966 400
0.6943396925926208 500
Epoch 1, training loss 0.693790737307552, time passed 0m 32s, training accuracy 0.5012
Validation loss 0.6941390972361605, validation accuracy 0.4916
0.6794251203536987 0
0.7008486390113831 100
0.6980838775634766 200
0.6835271120071411 300
0.6840112805366516 400
0.696955680847168 500
Epoch 2, training loss 0.6937036286125253, time passed 0m 48s, training accuracy 0.5015428571428572
Validation loss 0.6938381541488517, validation accuracy 0.49173333333333336
0.6919559836387634 0
0.6883322596549988 100
0.6951529979705

In [0]:
l_0 = 20
model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
print("SUG l_0={}, momentum=0. :".format(l_0))
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.)
tr_loss['sug'], times, val_loss['sug'], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sug'],
 'val_loss' : val_loss['sug'],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
torch.save(states, './IMDB/attn_sug')

SUG l_0=20, momentum=0. :
547
0.6829845309257507 0


 self.dropout, self.training, self.bidirectional, self.batch_first)


0.7011003494262695 100
0.6893718242645264 200
0.7490146160125732 300
0.6662403345108032 400
0.7302882075309753 500
Epoch 0, training loss 0.7087923813433874, time passed 0m 34s, training accuracy 0.524
Validation loss 0.6786875987154806, validation accuracy 0.5837333333333333
0.6089674234390259 0
0.5822049975395203 100
0.5361784100532532 200
0.48254698514938354 300
0.5182648301124573 400
0.5892543792724609 500
Epoch 1, training loss 0.5938645527585522, time passed 1m 8s, training accuracy 0.6910285714285714
Validation loss 0.5369184652709553, validation accuracy 0.7398666666666667
0.4637317359447479 0
0.49282562732696533 100
0.7132331728935242 200
0.37981486320495605 300
0.4586241543292999 400
0.5515133142471313 500
Epoch 2, training loss 0.473880006089097, time passed 1m 43s, training accuracy 0.7809142857142857
Validation loss 0.5159184689450468, validation accuracy 0.7656
0.4248925447463989 0
0.2927207946777344 100
0.3366227447986603 200
0.4807620048522949 300
0.45819780230522156 40

In [0]:
lrs = [0.0001]
for lr in lrs:
 model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.Adam(model.parameters(), lr=lr)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/attn_adam_' + str(lr))

SGD lr=0.0001, momentum=0. :
547
0.7074708938598633 0
0.699157178401947 100
0.6875941157341003 200
0.6936682462692261 300
0.6861938238143921 400
0.693611741065979 500
Epoch 0, training loss 0.6934679332888607, time passed 0m 25s, training accuracy 0.5128571428571429
Validation loss 0.6908549699518416, validation accuracy 0.5306666666666666
0.6809442043304443 0
0.7160238027572632 100
0.7025774121284485 200
0.6726558804512024 300
0.6648744940757751 400
0.7097936868667603 500
Epoch 1, training loss 0.6831305411272434, time passed 0m 52s, training accuracy 0.548
Validation loss 0.6688847951909416, validation accuracy 0.5654666666666667
0.7096719145774841 0
0.6852087378501892 100
0.6141272783279419 200
0.5822790265083313 300
0.5733252763748169 400
0.4738878607749939 500
Epoch 2, training loss 0.6086636586831167, time passed 1m 18s, training accuracy 0.6895428571428571
Validation loss 0.5956426103655089, validation accuracy 0.7068
0.572991132736206 0
0.5878808498382568 100
0.3920255303382873

In [0]:
l_0 = 20
model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
print("SUG l_0={}, momentum=0. :".format(l_0))
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.9, weight_decay=1e-3)
tr_loss['sug'], times, val_loss['sug'], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sug'],
 'val_loss' : val_loss['sug'],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
torch.save(states, './IMDB/attn_sug_0.9')

SUG l_0=20, momentum=0. :
547


 self.dropout, self.training, self.bidirectional, self.batch_first)


0.6980208158493042 0
0.6874722242355347 100
0.6817547678947449 200
0.6828298568725586 300
0.7083317637443542 400
0.7124193906784058 500
Epoch 0, training loss 0.6966337240659274, time passed 0m 40s, training accuracy 0.4993714285714286
Validation loss 0.6994800914047111, validation accuracy 0.4866666666666667
0.6685550808906555 0
0.6823179721832275 100
0.7071587443351746 200
0.6757057905197144 300
0.6761490702629089 400
0.6832512617111206 500
Epoch 1, training loss 0.6966324601417933, time passed 1m 21s, training accuracy 0.4993142857142857
Validation loss 0.6993934662423582, validation accuracy 0.4866666666666667
0.7265142202377319 0
0.6936051845550537 100
0.6723883152008057 200
0.6989526748657227 300
0.6900702118873596 400
0.7373038530349731 500
Epoch 2, training loss 0.696468867006756, time passed 2m 1s, training accuracy 0.49925714285714284
Validation loss 0.6993084137256329, validation accuracy 0.4868
0.6769171357154846 0
0.6965464949607849 100
0.7059141397476196 200
0.70592945814

In [0]:
l_0 = 20
model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
print("SUG l_0={}, momentum=0. :".format(l_0))
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.5, weight_decay=0.)
tr_loss['sug'], times, val_loss['sug'], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sug'],
 'val_loss' : val_loss['sug'],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
torch.save(states, './IMDB/attn_sug_0.5_wd_1e-4')

In [0]:
lrs = [0.0001]
for lr in lrs:
 model = AttentionModel(batch_size, 2, hidden_size, vocab_size, embedding_length, word_embeddings)
 print("SGD lr={}, momentum=0. :".format(lr))
 optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
 tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, tr_acc, val_acc = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=valid_iter)
 states = {
 'epoch': n_epochs,
 'state_dict': model.state_dict(),
 'optimizer': optimizer.state_dict(),
 'tr_loss' : tr_loss['sgd'][lr],
 'val_loss' : val_loss['sgd'][lr],
 'lips' : lips,
 'grad' : grad,
 'times' : times,
 'tr_acc' : tr_acc,
 'val_acc' : val_acc
 }
 torch.save(states, './IMDB/atnn_adam_' + str(lr)+'_wd_1e-4')