<a href="https://colab.research.google.com/github/sverdoot/optimizer-SUG-torch/blob/master/MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Optimization project")
os.getcwd()

file_path = "./MNIST"

try:
    os.stat(file_path)
except:
    os.mkdir(file_path)       

In [0]:
from sug import *

In [0]:
import torch
from torch.optim import Optimizer
import math
import copy

class SUG(Optimizer):
    def __init__(self, params, l_0, d_0=0, prob=1., eps=1e-4, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if l_0 < 0.0:
            raise ValueError("Invalid Lipsitz constant of gradient: {}".format(l_0))
        if d_0 < 0.0:
            raise ValueError("Invalid disperion of gradient: {}".format(d_0))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(L=l_0, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        self.Lips = l_0
        self.prev_Lips = l_0
        self.D_0 = d_0
        self.eps = eps
        self.prob = prob
        self.start_param = params
        self.upd_sq_grad_norm = None
        self.sq_grad_norm = None
        self.loss = torch.tensor(0.)
        self.cur_loss = 0
        self.closure = None
        super(SUG, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SUG, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def comp_batch_size(self):
        """Returns optimal batch size for given d_0, eps and l_0;

        """
        return math.ceil(2 * self.D_0 * self.eps / self.prev_Lips)

    def step(self, loss, closure):
        """Performs a single optimization step.

        Arguments:
            loss : current loss

            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self.start_params = []
        self.loss = loss
        self.sq_grad_norm = 0
        self.cur_loss = loss
        self.closure = closure
        for gr_idx, group in enumerate(self.param_groups):
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            self.start_params.append([])
            for p_idx, p in enumerate(group['params']):
                self.start_params[gr_idx].append([p.data.clone()])
                if p.grad is None:
                    continue
                self.start_params[gr_idx][p_idx].append(p.grad.data.clone())
                d_p = self.start_params[gr_idx][p_idx][1]
                p_ = self.start_params[gr_idx][p_idx][0]
                
                
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                    self.cur_loss += weight_decay * torch.sum(p * p).item()
                   
                
                self.sq_grad_norm += torch.sum(d_p * d_p).item()
                
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                self.start_params[gr_idx][p_idx][1] = d_p
                
        i = 0
        self.Lips = max(self.prev_Lips / 2, 0.1)
        difference = -1
        while difference < 0 or i == 0:
            if (i > 0): 
                self.Lips = max(self.Lips * 2, 0.1)
            for gr_idx, group in enumerate(self.param_groups):
                for p_idx, p in enumerate(group['params']):
                    if p.grad is None:
                        continue
                    start_param_val = self.start_params[gr_idx][p_idx][0]
                    start_param_grad = self.start_params[gr_idx][p_idx][1]
                    p.data = start_param_val - 1/(2*self.Lips) * start_param_grad
            difference, upd_loss = self.stop_criteria()
            i += 1
        self.prev_Lips = self.Lips

        return self.Lips, i

    def stop_criteria(self):
        """Checks if the Lipsitz constant of gradient is appropriate
        
           <g(x_k), w_k - x_k> + 2L_k / 2 ||x_k - w_k||^2 = - 1 / (2L_k)||g(x_k)||^2 + 1 / (4L_k)||g(x_k)||^2 = -1 / (4L_k)||g(x_k)||^2                
        """
        upd_loss = self.closure()
        major =  self.cur_loss - 1 / (4 * self.Lips) * self.sq_grad_norm
        return major - upd_loss - self.l2_reg() + self.eps / 10, upd_loss

    def get_lipsitz_const(self):
        """Returns current Lipsitz constant of the gradient of the loss function
        """
        return self.Lips
    
    def get_sq_grad(self):
        """Returns the current second norm of the gradient of the loss function 
           calculated by the formula
           
           ||f'(p_1,...,p_n)||_2^2 ~ \sum\limits_{i=1}^n ((df/dp_i) * (df/dp_i))(p1,...,p_n))
           
        """
        self.upd_sq_grad_norm = 0
        for gr_idx, group in enumerate(self.param_groups):
            for p_idx, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                self.upd_sq_grad_norm += torch.sum(p.grad.data * p.grad.data).item()
        
        return self.upd_sq_grad_norm
    
    def l2_reg(self):
        """Returns the current l2 regularization addiction
           
        """
        self.upd_l2_reg = 0
        for gr_idx, group in enumerate(self.param_groups):
            weight_decay = group['weight_decay']
            if weight_decay != 0:
                for p_idx, p in enumerate(group['params']):
                    self.upd_l2_reg += weight_decay * torch.sum(p * p).item()
        
        return self.upd_l2_reg

In [0]:
%matplotlib inline
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
import numpy as np

import time

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device

device(type='cuda', index=0)

## Data

In [0]:
batch_size = 512

In [0]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5,), (0.5,))])

trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)

valid_dataset = torchvision.datasets.MNIST(root='/data', train=True, 
                                        download=True, transform=transform)

testset = torchvision.datasets.MNIST(root='./data', train=False,
                                        download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /data/MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:01, 8639858.13it/s]                            


Extracting /data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /data/MNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 130050.77it/s]           
  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /data/MNIST/raw/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /data/MNIST/raw/t10k-images-idx3-ubyte.gz


1654784it [00:00, 2357733.35it/s]                           
0it [00:00, ?it/s]

Extracting /data/MNIST/raw/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /data/MNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 49203.85it/s]            


Extracting /data/MNIST/raw/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [0]:
valid_size=0.15
num_train = len(trainset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))

np.random.seed(42)
np.random.shuffle(indices)

train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

trainloader = torch.utils.data.DataLoader(trainset, 
               batch_size=batch_size, sampler=train_sampler,
               num_workers=2)

validloader = torch.utils.data.DataLoader(valid_dataset, 
               batch_size=batch_size, sampler=valid_sampler,
               num_workers=2)

In [0]:
def show_batch(batch):
    im = torchvision.utils.make_grid(batch)
    plt.imshow(np.transpose(im.numpy(), (1, 2, 0)))
    
dataiter = iter(trainloader)
images, labels = dataiter.next()

print('Labels: ', labels)
print('Batch shape: ', images.size())
show_batch(images)

## Models

#### Logistic Regression

In [0]:
class LR(nn.Module):
    def __init__(self):
        super(LR, self).__init__()
        self.linear1 = nn.Linear(28*28, 10)

    def forward(self, x):
        batch_size = x.size(0)
        y_pred = F.log_softmax(self.linear1(x.view(batch_size, -1)), -1)
        return y_pred

###   FC

In [0]:
class FC(nn.Module):
    def __init__(self):
        super(FC, self).__init__()
        self.linear1 = nn.Linear(28*28, 256)
        self.linear2 = nn.Linear(256, 10)

    def forward(self, x):
        batch_size = x.size(0)
        h_relu = F.relu(self.linear1(x.view(batch_size, -1)))
        y_pred = self.linear2(h_relu)
        return y_pred

### Conv

In [0]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=5)
        self.conv3 = nn.Conv2d(32,64, kernel_size=5)
        self.fc1 = nn.Linear(3*3*64, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        #x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(F.max_pool2d(self.conv3(x),2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.view(-1,3*3*64 )
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

## Train

In [0]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def model_step(model, optimizer, criterion, inputs, labels):
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    if model.training:
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
    if optimizer.__class__.__name__ != 'SUG':
        optimizer.step()
    else:
        def closure():
            optimizer.zero_grad()
            upd_outputs = model(inputs)
            upd_loss = criterion(upd_outputs, labels).item()

            return upd_loss

        optimizer.step(loss.item(), closure)

    return loss.item()

In [0]:
def train(model, trainloader, criterion, optimizer, n_epochs=2, validloader=None, eps=1e-5, print_every=1):
    tr_loss, val_loss, lips, times, grad, acc = ([] for i in range(6))
    start_time = time.time()
    model.to(device=device)
    for ep in range(n_epochs):
        model.train()
        i = 0
        for i, data in enumerate(trainloader):
            inputs, labels = data
           
            inputs, labels = Variable(inputs).to(device=device), Variable(labels).to(device=device)
            #print(inputs.size())
            tr_loss.append(model_step(model, optimizer, criterion, inputs, labels))
            if optimizer.__class__.__name__ == 'SUG':
                lips.append(optimizer.get_lipsitz_const())
                grad.append(optimizer.get_sq_grad)
        times.append(time_since(start_time))
        if ep % print_every == 0:
            print("Epoch {}, training loss {}, time passed {}".format(ep, sum(tr_loss[-i:]) / i, time_since(start_time)))

        if validloader is None:
            continue
        model.zero_grad()
        model.eval()
        j = 0
        count = 0
        n_ex = 0
        for j, data in enumerate(validloader):
            inputs, labels = data
            inputs, labels = inputs.to(device=device), labels.to(device=device)
            outputs = model(inputs)
            count += (torch.argmax(outputs, 1) == labels).float().sum().item()
            n_ex += outputs.size(0) 
            val_loss.append(criterion(outputs, labels).item())
        acc.append(count / n_ex)
        if ep % print_every == 0:
            print("Validation loss {}, validation accuracy {}".format(sum(val_loss[-j:]) / j, acc[-1]))
        
    return tr_loss, times, val_loss, lips, grad, acc

In [0]:
print_every = 4
n_epochs = 10
tr_loss = {}
tr_loss['sgd'] = {}
val_loss = {}
val_loss['sgd'] = {}
lrs = [0.05, 0.01, 0.005]
criterion = nn.CrossEntropyLoss(reduction="mean")
torch.manual_seed(999)

<torch._C.Generator at 0x7f50374f9670>

In [0]:
def concat_states(state1, state2):
    states = {
            'epoch': state1['epoch'] + state2['epoch'],
            'state_dict': state2['state_dict'],
            'optimizer': state2['optimizer'],
            'tr_loss' : state1['tr_loss'] + state2['tr_loss'],
            'val_loss' : state1['val_loss'] + state2['val_loss'],
            'lips' : state1['lips'] + state2['lips'],
            'grad' : state1['grad'] + state2['grad'],
            #'times' : state1['times'] + list(map(lambda x: x + state1['times'][-1],state2['times']))
             'times' : state1['times'] + state2['times']
             }
    return states

###LR

In [0]:
n_epochs = 20

In [0]:
for lr in lrs:
  model = LR()
  print("SGD  lr={}, momentum=0. :".format(lr))
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0., weight_decay=1e-3)
  tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
  states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
             }
  torch.save(states, './MNIST/LR_' + str(lr))

SGD  lr=0.05, momentum=0. :
Epoch 0, training loss 0.8053867536963839, time passed 0m 4s
Validation loss 0.4970751264516045, validation accuracy 0.8594444444444445
Epoch 4, training loss 0.3556738823953301, time passed 0m 29s
Validation loss 0.34582727095660043, validation accuracy 0.8994444444444445
Epoch 8, training loss 0.324508320201527, time passed 0m 55s
Validation loss 0.32668687315548167, validation accuracy 0.9034444444444445
Epoch 12, training loss 0.31013485217335246, time passed 1m 17s
Validation loss 0.3108014885117026, validation accuracy 0.9085555555555556
Epoch 16, training loss 0.3015503261727516, time passed 1m 40s
Validation loss 0.3028887212276459, validation accuracy 0.9145555555555556
SGD  lr=0.01, momentum=0. :
Epoch 0, training loss 1.2965486091796798, time passed 0m 4s
Validation loss 0.8687916503233069, validation accuracy 0.8221111111111111
Epoch 4, training loss 0.49756273416557695, time passed 0m 27s
Validation loss 0.4874144634779762, validation accuracy 0

In [0]:
l_0 = 20
model = LR()
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0., weight_decay=1e-3)
tr_loss['sug'], times, val_loss['sug'], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
         }
torch.save(states, './MNIST/LR_sug')

Epoch 0, training loss 0.8922390636771617, time passed 0m 4s
Validation loss 0.5756662032183479, validation accuracy 0.8471111111111111
Epoch 4, training loss 0.36624673069125474, time passed 0m 29s
Validation loss 0.35796097797505994, validation accuracy 0.8953333333333333
Epoch 8, training loss 0.3267748903746557, time passed 0m 54s
Validation loss 0.32799020935507384, validation accuracy 0.906
Epoch 12, training loss 0.30954578579074205, time passed 1m 18s
Validation loss 0.31454072805011973, validation accuracy 0.9114444444444444
Epoch 16, training loss 0.29919745840809564, time passed 1m 42s
Validation loss 0.2986759543418884, validation accuracy 0.9141111111111111


### FC

In [0]:
n_epochs = 20

In [0]:
torch.manual_seed(999)
for lr in lrs:
  model = FC()
  print("SGD  lr={}, momentum=0. :".format(lr))
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0., weight_decay=1e-3)
  tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
  states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
             }
  torch.save(states, './MNIST/FC_' + str(lr))

SGD  lr=0.05, momentum=0. :
Epoch 0, training loss 0.9827026613432952, time passed 0m 4s
Validation loss 0.521831205662559, validation accuracy 0.8611111111111112
Epoch 4, training loss 0.32333635019533563, time passed 0m 27s
Validation loss 0.33837247420759764, validation accuracy 0.8983333333333333
Epoch 8, training loss 0.2735136151313782, time passed 0m 52s
Validation loss 0.27770407059613395, validation accuracy 0.9204444444444444
Epoch 12, training loss 0.2350455638435152, time passed 1m 14s
Validation loss 0.2334472729879267, validation accuracy 0.9327777777777778
Epoch 16, training loss 0.20252520448029643, time passed 1m 37s
Validation loss 0.21084140153492198, validation accuracy 0.9381111111111111
SGD  lr=0.01, momentum=0. :
Epoch 0, training loss 1.8455105502196032, time passed 0m 6s
Validation loss 1.3969286049113554, validation accuracy 0.7461111111111111
Epoch 4, training loss 0.5331706323406913, time passed 0m 28s
Validation loss 0.5103560914011562, validation accuracy 

In [0]:
l_0 = 20
model = FC()
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0., weight_decay=1e-3)
tr_loss['sug'], times, val_loss['sug'], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
         }
torch.save(states, './MNIST/FC_sug')

Epoch 0, training loss 0.841091928457973, time passed 0m 5s
Validation loss 0.5276602085898904, validation accuracy 0.862
Epoch 4, training loss 0.3375983780080622, time passed 0m 31s
Validation loss 0.32630446903845844, validation accuracy 0.9072222222222223
Epoch 8, training loss 0.2841122362649802, time passed 0m 55s
Validation loss 0.28937043161953196, validation accuracy 0.914
Epoch 12, training loss 0.2409512424709821, time passed 1m 19s
Validation loss 0.24036436747102177, validation accuracy 0.9277777777777778
Epoch 16, training loss 0.20427992352933594, time passed 1m 45s
Validation loss 0.20693528213921716, validation accuracy 0.9405555555555556


+ momentum

In [0]:
torch.manual_seed(999)
for lr in lrs:
  model = FC()
  print("SGD  lr={}, momentum=0.9 :".format(lr))
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-3)
  tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
  states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
             }
  torch.save(states, './MNIST/FC_' + str(lr)+'_0.9')

SGD  lr=0.05, momentum=0.9 :
Epoch 0, training loss 0.5381249714680393, time passed 0m 5s
Validation loss 0.28269324670819673, validation accuracy 0.9155555555555556
Epoch 4, training loss 0.12210867298070831, time passed 0m 29s
Validation loss 0.1238884250907337, validation accuracy 0.9644444444444444
Epoch 8, training loss 0.08087855562417194, time passed 0m 56s
Validation loss 0.11202001571655273, validation accuracy 0.9683333333333334
Epoch 12, training loss 0.06308559685794994, time passed 1m 18s
Validation loss 0.08321712078417048, validation accuracy 0.9768888888888889
Epoch 16, training loss 0.05535656740569105, time passed 1m 40s
Validation loss 0.09248179460273069, validation accuracy 0.9728888888888889
SGD  lr=0.01, momentum=0.9 :
Epoch 0, training loss 0.8471114117689807, time passed 0m 4s
Validation loss 0.39922235643162446, validation accuracy 0.886
Epoch 4, training loss 0.2757103128565682, time passed 0m 29s
Validation loss 0.26469812410719257, validation accuracy 0.924

In [0]:
torch.manual_seed(999)
l_0 = 20
model = FC()
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.9, weight_decay=1e-3)
tr_loss['sug'], times, val_loss['sug'], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
         }
torch.save(states, './MNIST/FC_sug_0.9')

Epoch 0, training loss 0.5774111350377401, time passed 0m 5s
Validation loss 0.4330375124426449, validation accuracy 0.8753333333333333
Epoch 4, training loss 0.35169939471013617, time passed 0m 29s
Validation loss 0.3514442969770992, validation accuracy 0.8977777777777778
Epoch 8, training loss 0.3251004270230881, time passed 0m 55s
Validation loss 0.3274660198127522, validation accuracy 0.9045555555555556
Epoch 12, training loss 0.309954424398114, time passed 1m 20s
Validation loss 0.3091518423136543, validation accuracy 0.9075555555555556
Epoch 16, training loss 0.2942978652438732, time passed 1m 47s
Validation loss 0.2984069471850115, validation accuracy 0.9126666666666666


### CNN

In [0]:
for lr in lrs:
  model = CNN()
  print("SGD  lr={}, momentum=0. :".format(lr))
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0., weight_decay=1e-3)
  tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
  states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
             }
  torch.save(states, './MNIST/CNN_' + str(lr))

In [0]:
#n_epochs = 8
l_0 = 20
model = CNN()
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0., weight_decay=1e-3)
tr_loss['sug'], times, val_loss['sug'], lips, grad, acc = train(model, trainloader, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=validloader)
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad,
            'times' : times,
            'acc' : acc
         }
torch.save(states, './MNIST/CNN_sug')