Deep Learning Models -- A collection of various deep learning architectures, models, and tips for TensorFlow and PyTorch in Jupyter Notebooks.
- Author: Sebastian Raschka
- GitHub Repository: https://github.com/rasbt/deeplearning-models

In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -p torch

Sebastian Raschka 

CPython 3.7.3
IPython 7.9.0

torch 1.3.0


# Filter Response Normalization for Network-in-Network CIFAR-10 Classifier

The CNN architecture is based on 

- Lin, Min, Qiang Chen, and Shuicheng Yan. "[Network in network](https://arxiv.org/abs/1312.4400)." arXiv preprint arXiv:1312.4400 (2013).

This notebook implements Filter Response Normalization as a drop-in replacement for BatchNorm, based on the paper:

- S. Singh and S. Krishnan (2019). **Filter Response Normalization Layer: Eliminating Batch Dependence in the Training of Deep Neural Networks.** https://arxiv.org/abs/1911.09737

In [2]:
import torch
import torch.nn as nn


class FilterResponseNormalization(nn.Module):
 def __init__(self, num_features, eps=1e-6):
 super(FilterResponseNormalization, self).__init__()
 
 self.register_parameter('beta', 
 torch.nn.Parameter(
 torch.empty([1, num_features, 1, 1]).normal_()))
 
 self.register_parameter('gamma', 
 torch.nn.Parameter(
 torch.empty([1, num_features, 1, 1]).normal_()))
 
 self.register_parameter('tau', 
 torch.nn.Parameter(
 torch.empty([1, num_features, 1, 1]).normal_()))
 
 self.eps = torch.Tensor([eps])

 def forward(self, x):
 # forward function based on
 # https://github.com/gupta-abhay/pytorch-frn/blob/master/frn.py
 n, c, h, w = x.size()
 
 self.eps = self.eps.to(self.tau.device)

 nu2 = torch.mean(x.pow(2), (2, 3), keepdims=True)
 x = x * torch.rsqrt(nu2 + torch.abs(self.eps))
 return torch.max(self.gamma*x + self.beta, self.tau)

## Additional Imports

In [3]:
import os
import time

import numpy as np
import pandas as pd

import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Subset

from torchvision import datasets
from torchvision import transforms

import matplotlib.pyplot as plt
from PIL import Image


if torch.cuda.is_available():
 torch.backends.cudnn.deterministic = True

## Model Settings

In [4]:
##########################
### SETTINGS
##########################

# Hyperparameters
RANDOM_SEED = 1
LEARNING_RATE = 0.00005
BATCH_SIZE = 256
NUM_EPOCHS = 100

# Architecture
NUM_CLASSES = 10

# Other
DEVICE = "cuda:0"
GRAYSCALE = False

In [5]:
##########################
### CIFAR-10 Dataset
##########################


# Note transforms.ToTensor() scales input images
# to 0-1 range


train_indices = torch.arange(0, 49000)
valid_indices = torch.arange(49000, 50000)


train_and_valid = datasets.CIFAR10(root='data', 
 train=True, 
 transform=transforms.ToTensor(),
 download=True)

train_dataset = Subset(train_and_valid, train_indices)
valid_dataset = Subset(train_and_valid, valid_indices)


test_dataset = datasets.CIFAR10(root='data', 
 train=False, 
 transform=transforms.ToTensor())


#####################################################
### Data Loaders
#####################################################

train_loader = DataLoader(dataset=train_dataset, 
 batch_size=BATCH_SIZE,
 num_workers=8,
 shuffle=True)

valid_loader = DataLoader(dataset=valid_dataset, 
 batch_size=BATCH_SIZE,
 num_workers=8,
 shuffle=False)

test_loader = DataLoader(dataset=test_dataset, 
 batch_size=BATCH_SIZE,
 num_workers=8,
 shuffle=False)

#####################################################

# Checking the dataset
for images, labels in train_loader: 
 print('Image batch dimensions:', images.shape)
 print('Image label dimensions:', labels.shape)
 break

for images, labels in test_loader: 
 print('Image batch dimensions:', images.shape)
 print('Image label dimensions:', labels.shape)
 break
 
for images, labels in valid_loader: 
 print('Image batch dimensions:', images.shape)
 print('Image label dimensions:', labels.shape)
 break

Files already downloaded and verified
Image batch dimensions: torch.Size([256, 3, 32, 32])
Image label dimensions: torch.Size([256])
Image batch dimensions: torch.Size([256, 3, 32, 32])
Image label dimensions: torch.Size([256])
Image batch dimensions: torch.Size([256, 3, 32, 32])
Image label dimensions: torch.Size([256])


# Filter Response Normalization

In [6]:
##########################
### MODEL
##########################


class NiN(nn.Module):
 def __init__(self, num_classes):
 super(NiN, self).__init__()
 self.num_classes = num_classes
 self.classifier = nn.Sequential(
 nn.Conv2d(3, 192, kernel_size=5, stride=1, padding=2, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.Conv2d(192, 160, kernel_size=1, stride=1, padding=0, bias=False),
 FilterResponseNormalization(160),
 #nn.ReLU(inplace=True),
 nn.Conv2d(160, 96, kernel_size=1, stride=1, padding=0, bias=False),
 FilterResponseNormalization(96),
 #nn.ReLU(inplace=True),
 nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
 nn.Dropout(0.5),

 nn.Conv2d(96, 192, kernel_size=5, stride=1, padding=2, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.AvgPool2d(kernel_size=3, stride=2, padding=1),
 nn.Dropout(0.5),

 nn.Conv2d(192, 192, kernel_size=3, stride=1, padding=1, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 FilterResponseNormalization(192),
 #nn.ReLU(inplace=True),
 nn.Conv2d(192, 10, kernel_size=1, stride=1, padding=0),
 nn.ReLU(inplace=True),
 nn.AvgPool2d(kernel_size=8, stride=1, padding=0),

 )

 def forward(self, x):
 x = self.classifier(x)
 logits = x.view(x.size(0), self.num_classes)
 probas = torch.softmax(logits, dim=1)
 return logits, probas

In [7]:
torch.manual_seed(RANDOM_SEED)

model = NiN(NUM_CLASSES)
model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) 

In [8]:
def compute_accuracy(model, data_loader, device):
 correct_pred, num_examples = 0, 0
 for i, (features, targets) in enumerate(data_loader):
 
 features = features.to(device)
 targets = targets.to(device)

 logits, probas = model(features)
 _, predicted_labels = torch.max(probas, 1)
 num_examples += targets.size(0)
 correct_pred += (predicted_labels == targets).sum()
 return correct_pred.float()/num_examples * 100
 

start_time = time.time()
for epoch in range(NUM_EPOCHS):
 
 model.train()
 
 for batch_idx, (features, targets) in enumerate(train_loader):
 
 ### PREPARE MINIBATCH
 features = features.to(DEVICE)
 targets = targets.to(DEVICE)
 
 ### FORWARD AND BACK PROP
 logits, probas = model(features)
 cost = F.cross_entropy(logits, targets)
 optimizer.zero_grad()
 
 cost.backward()
 
 ### UPDATE MODEL PARAMETERS
 optimizer.step()
 
 ### LOGGING
 if not batch_idx % 120:
 print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
 f'Batch {batch_idx:03d}/{len(train_loader):03d} |' 
 f' Cost: {cost:.4f}')

 # no need to build the computation graph for backprop when computing accuracy
 with torch.set_grad_enabled(False):
 train_acc = compute_accuracy(model, train_loader, device=DEVICE)
 valid_acc = compute_accuracy(model, valid_loader, device=DEVICE)
 print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} Train Acc.: {train_acc:.2f}%'
 f' | Validation Acc.: {valid_acc:.2f}%')
 
 elapsed = (time.time() - start_time)/60
 print(f'Time elapsed: {elapsed:.2f} min')
 
elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

Epoch: 001/100 | Batch 000/192 | Cost: 2.3138
Epoch: 001/100 | Batch 120/192 | Cost: 2.1962
Epoch: 001/100 Train Acc.: 22.35% | Validation Acc.: 24.80%
Time elapsed: 0.60 min
Epoch: 002/100 | Batch 000/192 | Cost: 2.1328
Epoch: 002/100 | Batch 120/192 | Cost: 2.0764
Epoch: 002/100 Train Acc.: 24.54% | Validation Acc.: 26.70%
Time elapsed: 1.21 min
Epoch: 003/100 | Batch 000/192 | Cost: 2.0738
Epoch: 003/100 | Batch 120/192 | Cost: 1.9631
Epoch: 003/100 Train Acc.: 28.22% | Validation Acc.: 32.10%
Time elapsed: 1.80 min
Epoch: 004/100 | Batch 000/192 | Cost: 2.0147
Epoch: 004/100 | Batch 120/192 | Cost: 1.9606
Epoch: 004/100 Train Acc.: 29.99% | Validation Acc.: 31.10%
Time elapsed: 2.40 min
Epoch: 005/100 | Batch 000/192 | Cost: 1.9880
Epoch: 005/100 | Batch 120/192 | Cost: 1.9411
Epoch: 005/100 Train Acc.: 31.99% | Validation Acc.: 33.40%
Time elapsed: 3.00 min
Epoch: 006/100 | Batch 000/192 | Cost: 1.9271
Epoch: 006/100 | Batch 120/192 | Cost: 1.9257
Epoch: 006/100 Train Acc.: 36.23%

## Batch Normalization (for comparison)

In [9]:
##########################
### MODEL
##########################


class NiN(nn.Module):
 def __init__(self, num_classes):
 super(NiN, self).__init__()
 self.num_classes = num_classes
 self.classifier = nn.Sequential(
 nn.Conv2d(3, 192, kernel_size=5, stride=1, padding=2, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.Conv2d(192, 160, kernel_size=1, stride=1, padding=0, bias=False),
 nn.BatchNorm2d(160),
 nn.ReLU(inplace=True),
 nn.Conv2d(160, 96, kernel_size=1, stride=1, padding=0, bias=False),
 nn.BatchNorm2d(96),
 nn.ReLU(inplace=True),
 nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
 nn.Dropout(0.5),

 nn.Conv2d(96, 192, kernel_size=5, stride=1, padding=2, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.AvgPool2d(kernel_size=3, stride=2, padding=1),
 nn.Dropout(0.5),

 nn.Conv2d(192, 192, kernel_size=3, stride=1, padding=1, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.Conv2d(192, 192, kernel_size=1, stride=1, padding=0, bias=False),
 nn.BatchNorm2d(192),
 nn.ReLU(inplace=True),
 nn.Conv2d(192, 10, kernel_size=1, stride=1, padding=0),
 nn.ReLU(inplace=True),
 nn.AvgPool2d(kernel_size=8, stride=1, padding=0),

 )

 def forward(self, x):
 x = self.classifier(x)
 logits = x.view(x.size(0), self.num_classes)
 probas = torch.softmax(logits, dim=1)
 return logits, probas

In [10]:
start_time = time.time()
for epoch in range(NUM_EPOCHS):
 
 model.train()
 
 for batch_idx, (features, targets) in enumerate(train_loader):
 
 ### PREPARE MINIBATCH
 features = features.to(DEVICE)
 targets = targets.to(DEVICE)
 
 ### FORWARD AND BACK PROP
 logits, probas = model(features)
 cost = F.cross_entropy(logits, targets)
 optimizer.zero_grad()
 
 cost.backward()
 
 ### UPDATE MODEL PARAMETERS
 optimizer.step()
 
 ### LOGGING
 if not batch_idx % 120:
 print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
 f'Batch {batch_idx:03d}/{len(train_loader):03d} |' 
 f' Cost: {cost:.4f}')

 # no need to build the computation graph for backprop when computing accuracy
 with torch.set_grad_enabled(False):
 train_acc = compute_accuracy(model, train_loader, device=DEVICE)
 valid_acc = compute_accuracy(model, valid_loader, device=DEVICE)
 print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} Train Acc.: {train_acc:.2f}%'
 f' | Validation Acc.: {valid_acc:.2f}%')
 
 elapsed = (time.time() - start_time)/60
 print(f'Time elapsed: {elapsed:.2f} min')
 
elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

Epoch: 001/100 | Batch 000/192 | Cost: 0.6825
Epoch: 001/100 | Batch 120/192 | Cost: 0.7027
Epoch: 001/100 Train Acc.: 76.08% | Validation Acc.: 72.60%
Time elapsed: 0.60 min
Epoch: 002/100 | Batch 000/192 | Cost: 0.6714
Epoch: 002/100 | Batch 120/192 | Cost: 0.6445
Epoch: 002/100 Train Acc.: 76.35% | Validation Acc.: 73.40%
Time elapsed: 1.20 min
Epoch: 003/100 | Batch 000/192 | Cost: 0.7066
Epoch: 003/100 | Batch 120/192 | Cost: 0.6379
Epoch: 003/100 Train Acc.: 76.63% | Validation Acc.: 73.80%
Time elapsed: 1.80 min
Epoch: 004/100 | Batch 000/192 | Cost: 0.7371
Epoch: 004/100 | Batch 120/192 | Cost: 0.7181
Epoch: 004/100 Train Acc.: 75.50% | Validation Acc.: 70.80%
Time elapsed: 2.40 min
Epoch: 005/100 | Batch 000/192 | Cost: 0.7634
Epoch: 005/100 | Batch 120/192 | Cost: 0.6887
Epoch: 005/100 Train Acc.: 77.08% | Validation Acc.: 73.50%
Time elapsed: 3.00 min
Epoch: 006/100 | Batch 000/192 | Cost: 0.6509
Epoch: 006/100 | Batch 120/192 | Cost: 0.7140
Epoch: 006/100 Train Acc.: 76.52%