# Swish Implementation Comparison

In [1]:
# Minimal fork of https://github.com/rwightman/gen-efficientnet-pytorch
# Adds setup and lets you set the activation function
# Note: the changes are on the `setup` branch
# !pip install git+https://github.com/thomasbrandon/gen-efficientnet-pytorch@setup

In [2]:
from fastai.vision import *
from gen_efficientnet.gen_efficientnet import efficientnet_b0, model_urls
import swish_torch

In [3]:
# Hyperparameters shared by every run in this notebook
SIZE = 256  # images are resized/cropped to SIZE x SIZE
BS = 48     # batch size; the CUDA/Function variants could probably go a little higher, but the same value is used for all
LR = 1e-3   # learning rate passed to fit_one_cycle

## Setup

In [4]:
# Fetch the Imagewoof (320px) dataset and remember where it lives
PATH = untar_data(URLs.IMAGEWOOF_320)

# (train transforms, validation transforms): random horizontal flip for training only
flip_tfms = ([flip_lr(p=0.5)], [])

# Stage 1: collect images, split on the 'val' folder, label by folder name
labelled = (ImageList.from_folder(PATH)
            .split_by_folder(valid='val')
            .label_from_folder())

# Stage 2: apply transforms at SIZE, batch, presize and normalise with ImageNet statistics
data = (labelled
        .transform(flip_tfms, size=SIZE)
        .databunch(bs=BS, num_workers=6)
        .presize(SIZE, scale=(0.35,1))
        .normalize(imagenet_stats))

In [5]:
class PeakMemMetric(LearnerCallback):
    """Callback that reports peak cached and peak allocated GPU memory (in MB) per epoch.

    The CUDA peak counters are reset at the start of each epoch and read back at
    the end, then appended to the recorder's metrics as 'cache MB' and 'alloc MB'.
    """
    _order=-20 # Needs to run before the recorder

    def __init__(self, learn:Learner, device=None):
        "Attach to `learn`; `device` defaults to the current CUDA device."
        super().__init__(learn)
        assert torch.cuda.is_available(), "pytorch CUDA is required"
        self._dev = ifnone(device, torch.cuda.current_device())

    @staticmethod
    def _to_mb(n_bytes):
        "Convert a byte count to whole megabytes (truncated)."
        return int(n_bytes/2**20)

    def on_train_begin(self, **kwargs):
        "Register the two extra metric columns with the recorder."
        self.learn.recorder.add_metric_names(['cache MB',  'alloc MB'])

    def on_epoch_begin(self, **kwargs):
        "Reset the peak counters so each epoch is measured on its own."
        # NOTE(review): newer PyTorch releases rename the *_cached APIs to *_reserved —
        # confirm against the installed version before changing these calls.
        torch.cuda.reset_max_memory_cached(self._dev)
        torch.cuda.reset_max_memory_allocated(self._dev)

    def on_epoch_end(self, last_metrics, **kwargs):
        "Append this epoch's peak cached/allocated memory (MB) to `last_metrics`."
        peaks = (torch.cuda.max_memory_cached(self._dev),
                 torch.cuda.max_memory_allocated(self._dev))
        return add_metrics(last_metrics, [self._to_mb(p) for p in peaks])

In [6]:
def load_pretrained(mdl):
    """Load the pretrained `efficientnet_b0` weights into `mdl`.

    The downloaded state dict's classifier entries are replaced with `mdl`'s own
    classifier parameters before loading, because the linear layer is sized
    differently from the pretrained one. Every other entry comes from the
    downloaded state dict.
    """
    pretrained = torch.utils.model_zoo.load_url(model_urls['efficientnet_b0'])
    # Keep the model's own classifier parameters instead of the pretrained ones
    pretrained['classifier.weight'] = mdl.classifier.weight
    pretrained['classifier.bias'] = mdl.classifier.bias
    mdl.load_state_dict(pretrained)

In [7]:
# Imagewoof, see https://github.com/fastai/imagenette
# Subset of 10 dog breeds from Imagenet, 320px shortest side
# Display the DataBunch summary (item counts, image size, labels)
data

ImageDataBunch;

Train: LabelList (12454 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Valid: LabelList (500 items)
x: ImageList
Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256),Image (3, 256, 256)
y: CategoryList
n02111889,n02111889,n02111889,n02111889,n02111889
Path: /home/user/.fastai/data/imagewoof-320;

Test: None

## Original Implementation

In [8]:
# Build EfficientNet-B0 with one output per class in `data`; act_fn is left at its default
mdl = efficientnet_b0(num_classes=data.c)
# Load the pretrained weights, keeping this model's own classifier parameters
load_pretrained(mdl)

In [9]:
# Show the source of the model's default activation function (IPython `??` introspection)
mdl.act_fn??

[0;31mSignature:[0m [0mmdl[0m[0;34m.[0m[0mact_fn[0m[0;34m([0m[0mx[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mswish[0m[0;34m([0m[0mx[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0minplace[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mx[0m[0;34m.[0m[0mmul_[0m[0;34m([0m[0mx[0m[0;34m.[0m[0msigmoid[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0;32melse[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mx[0m [0;34m*[0m [0mx[0m[0;34m.[0m[0msigmoid[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/.conda/envs/fastai/lib/python3.7/site-packages/gen_efficientnet/efficientnet_builder.py
[0;31mType:[0m      function


In [10]:
# Baseline run: train for 5 one-cycle epochs, logging accuracy and peak GPU memory per epoch
lrn = Learner(
    data,
    mdl,
    metrics=[accuracy],
    callback_fns=[PeakMemMetric],
)
lrn.fit_one_cycle(5, LR)

epoch,train_loss,valid_loss,accuracy,cache MB,alloc MB,time
0,0.400987,0.370652,0.89,7204,6890,01:12
1,0.439666,0.385724,0.89,7106,6879,01:11
2,0.298581,0.274652,0.91,7106,6879,01:12
3,0.136597,0.231383,0.918,7106,6879,01:11
4,0.075961,0.211751,0.932,7106,6879,01:11


In [None]:
# Tear down the learner, then drop both references.
# NOTE(review): presumably this frees GPU memory before the next implementation is measured — confirm.
lrn.destroy()
del lrn, mdl

## Autograd Function Implementation

In [8]:
class SwishFunction(torch.autograd.Function):
    """Swish activation, `x * sigmoid(x)`, as a custom autograd Function.

    Only the input tensor is saved for the backward pass; the sigmoid is
    recomputed there rather than stored.
    """

    @staticmethod
    def forward(ctx, i):
        "Return `i * sigmoid(i)` and stash `i` for backward."
        ctx.save_for_backward(i)
        return i * torch.sigmoid(i)

    @staticmethod
    def backward(ctx, grad_output):
        "Return the gradient w.r.t. the input: grad_output * sig * (1 + x * (1 - sig))."
        (inp,) = ctx.saved_tensors
        # No gradient requested for the input: nothing to compute
        if not ctx.needs_input_grad[0]:
            return (None,)
        sig = torch.sigmoid(inp)
        return grad_output * (sig * (1 + inp * (1 - sig)))
    
def swish_function(x, inplace=False):
    """Adapter exposing `SwishFunction` with the signature gen_efficientnet expects.

    The activation function for gen_efficientnet has an `inplace` keyword;
    this implementation can't be inplace, so the argument is accepted and ignored.
    """
    return SwishFunction.apply(x)

# Autograd-Function run: same model and schedule, with `swish_function` as the activation
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_function)
load_pretrained(mdl)
lrn = Learner(
    data,
    mdl,
    metrics=[accuracy],
    callback_fns=[PeakMemMetric],
)
lrn.fit_one_cycle(5, LR)

epoch,train_loss,valid_loss,accuracy,cache MB,alloc MB,time
0,0.450081,0.59347,0.882,6432,5421,01:14
1,0.436954,0.368458,0.88,6432,5421,01:13
2,0.262158,0.368661,0.89,6432,5421,01:14
3,0.142793,0.246673,0.928,6432,5421,01:14
4,0.075377,0.240533,0.924,6432,5421,01:14


In [None]:
# Tear down the learner, then drop both references.
# NOTE(review): presumably this frees GPU memory before the next implementation is measured — confirm.
lrn.destroy()
del lrn, mdl

## CUDA Implementation

In [8]:
def swish_cuda_fn(x, inplace=False):
    """Adapter exposing `swish_torch.swish` with the signature gen_efficientnet expects.

    The activation function for gen_efficientnet has an `inplace` keyword;
    this implementation can't be inplace, so the argument is accepted and ignored.
    """
    return swish_torch.swish(x)

# CUDA run: same model and schedule, with `swish_cuda_fn` as the activation
mdl = efficientnet_b0(num_classes=data.c, act_fn=swish_cuda_fn)
load_pretrained(mdl)
lrn = Learner(
    data,
    mdl,
    metrics=[accuracy],
    callback_fns=[PeakMemMetric],
)
lrn.fit_one_cycle(5, LR)

epoch,train_loss,valid_loss,accuracy,cache MB,alloc MB,time
0,0.444761,0.394772,0.874,5934,5400,01:02
1,0.441538,0.434501,0.866,5934,5400,01:01
2,0.29332,0.27606,0.906,5934,5400,01:02
3,0.149419,0.245342,0.918,5934,5400,01:02
4,0.061624,0.258465,0.918,5934,5400,01:02


In [None]:
# Tear down the learner, then drop both references.
# NOTE(review): presumably this frees GPU memory held by the run above — confirm.
lrn.destroy()
del lrn, mdl

# Results
```
		  train_loss  valid_loss  accuracy  cache MB  alloc MB  time
Original  0.075961    0.211751    0.932000  7106      6879      01:11
Autograd  0.075377    0.240533    0.924000  6432      5421      01:14
CUDA      0.061624    0.258465    0.918000  5934      5400      01:02
```

So the CUDA version is (slightly) faster than the original, with roughly the memory usage of the Autograd version.