{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from exp.nb_06 import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ConvNet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's get the data and training interface from where we left in the last notebook." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=5899)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train,y_train,x_valid,y_valid = get_data()\n", "\n", "x_train,x_valid = normalize_to(x_train,x_valid)\n", "train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)\n", "\n", "nh,bs = 50,512\n", "c = y_train.max().item()+1\n", "loss_func = F.cross_entropy\n", "\n", "data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist_view = view_tfm(1,28,28)\n", "cbfs = [Recorder,\n", " partial(AvgStatsCallback,accuracy),\n", " CudaCallback,\n", " partial(BatchTransformXCallback, mnist_view)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nfs = [8,16,32,64,64]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [1.024234921875, tensor(0.6730, device='cuda:0')]\n", "valid: [0.2444910400390625, tensor(0.9262, device='cuda:0')]\n", "train: [0.162599970703125, tensor(0.9502, device='cuda:0')]\n", "valid: 
[0.10074585571289063, tensor(0.9698, device='cuda:0')]\n", "CPU times: user 3.78 s, sys: 1.61 s, total: 5.39 s\n", "Wall time: 6.37 s\n" ] } ], "source": [ "%time run.fit(2, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Batchnorm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Custom" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start by building our own `BatchNorm` layer from scratch." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=6018)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class BatchNorm(nn.Module):\n", " def __init__(self, nf, mom=0.1, eps=1e-5):\n", " super().__init__()\n", " # NB: pytorch bn mom is opposite of what you'd expect\n", " self.mom,self.eps = mom,eps\n", " self.mults = nn.Parameter(torch.ones (nf,1,1))\n", " self.adds = nn.Parameter(torch.zeros(nf,1,1))\n", " self.register_buffer('vars', torch.ones(1,nf,1,1))\n", " self.register_buffer('means', torch.zeros(1,nf,1,1))\n", "\n", " def update_stats(self, x):\n", " m = x.mean((0,2,3), keepdim=True)\n", " v = x.var ((0,2,3), keepdim=True)\n", " self.means.lerp_(m, self.mom)\n", " self.vars.lerp_ (v, self.mom)\n", " return m,v\n", " \n", " def forward(self, x):\n", " if self.training:\n", " with torch.no_grad(): m,v = self.update_stats(x)\n", " else: m,v = self.means,self.vars\n", " x = (x-m) / (v+self.eps).sqrt()\n", " return x*self.mults + self.adds" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " # No bias needed if using bn\n", " layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(BatchNorm(nf))\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "#export\n", "def init_cnn_(m, f):\n", " if isinstance(m, nn.Conv2d):\n", " f(m.weight, a=0.1)\n", " if getattr(m, 'bias', None) is not None: m.bias.data.zero_()\n", " for l in m.children(): init_cnn_(l, f)\n", "\n", "def init_cnn(m, uniform=False):\n", " f = init.kaiming_uniform_ if uniform else init.kaiming_normal_\n", " init_cnn_(m, f)\n", "\n", "def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):\n", " model = get_cnn_model(data, nfs, layer, **kwargs)\n", " init_cnn(model, uniform=uniform)\n", " return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can then use it in training and see how it helps keep the activations means to 0 and the std to 1." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.26532763671875, tensor(0.9189, device='cuda:0')]\n", "valid: [0.16395225830078125, tensor(0.9520, device='cuda:0')]\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "with Hooks(learn.model, append_stats) as hooks:\n", " run.fit(1, learn)\n", " fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))\n", " for h in hooks[:-1]:\n", " ms,ss = h.stats\n", " ax0.plot(ms[:10])\n", " ax1.plot(ss[:10])\n", " h.remove()\n", " plt.legend(range(6));\n", " \n", " fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))\n", " for h in hooks[:-1]:\n", " ms,ss = h.stats\n", " ax0.plot(ms)\n", " ax1.plot(ss)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 1.0, conv_layer, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.27833810546875, tensor(0.9105, device='cuda:0')]\n", "valid: [0.1912491943359375, tensor(0.9386, device='cuda:0')]\n", "train: [0.08977265625, tensor(0.9713, device='cuda:0')]\n", "valid: [0.09156090698242188, tensor(0.9716, device='cuda:0')]\n", "train: [0.06145498046875, tensor(0.9810, device='cuda:0')]\n", "valid: [0.09970919799804688, tensor(0.9707, device='cuda:0')]\n", "CPU times: user 3.71 s, sys: 584 ms, total: 4.29 s\n", "Wall time: 4.29 s\n" ] } ], "source": [ "%time run.fit(3, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Builtin batchnorm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=6679)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.27115255859375, tensor(0.9206, device='cuda:0')]\n", "valid: [0.1547997314453125, tensor(0.9496, device='cuda:0')]\n", "train: [0.07861462890625, tensor(0.9755, device='cuda:0')]\n", "valid: [0.07472044067382813, tensor(0.9776, device='cuda:0')]\n", "train: [0.0570835498046875, tensor(0.9818, device='cuda:0')]\n", "valid: [0.0673232666015625, tensor(0.9813, device='cuda:0')]\n", "CPU times: user 3.37 s, sys: 747 ms, total: 4.12 s\n", "Wall time: 4.12 s\n" ] } ], "source": [ "%time run.fit(3, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With scheduler" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's add the usual warm-up/annealing." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sched = combine_scheds([0.3, 0.7], [sched_lin(0.6, 2.), sched_lin(2., 0.1)]) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs\n", " +[partial(ParamScheduler,'lr', sched)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.2881914453125, tensor(0.9116, device='cuda:0')]\n", "valid: [0.5269224609375, tensor(0.8394, device='cuda:0')]\n", "train: [0.153792421875, tensor(0.9551, device='cuda:0')]\n", "valid: [0.462953662109375, tensor(0.8514, device='cuda:0')]\n", "train: [0.087637158203125, tensor(0.9736, device='cuda:0')]\n", "valid: [0.07029829711914062, tensor(0.9788, device='cuda:0')]\n", "train: [0.049282421875, tensor(0.9853, device='cuda:0')]\n", "valid: [0.08710025024414063, tensor(0.9740, device='cuda:0')]\n", "train: 
[0.0356986328125, tensor(0.9888, device='cuda:0')]\n", "valid: [0.07853966674804687, tensor(0.9773, device='cuda:0')]\n", "train: [0.0268300439453125, tensor(0.9918, device='cuda:0')]\n", "valid: [0.04807376098632812, tensor(0.9870, device='cuda:0')]\n", "train: [0.0219412109375, tensor(0.9939, device='cuda:0')]\n", "valid: [0.04363873901367187, tensor(0.9882, device='cuda:0')]\n", "train: [0.018501048583984374, tensor(0.9951, device='cuda:0')]\n", "valid: [0.04355916137695313, tensor(0.9877, device='cuda:0')]\n" ] } ], "source": [ "run.fit(8, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## More norms" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Layer norm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From [the paper](https://arxiv.org/abs/1607.06450): \"*batch normalization cannot be applied to online learning tasks or to extremely large distributed models where the minibatches have to be small*\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "General equation for a norm layer with learnable affine:\n", "\n", "$$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\epsilon}} * \\gamma + \\beta$$\n", "\n", "The difference with BatchNorm is\n", "1. we don't keep a moving average\n", "2. 
we don't average over the batches dimension but over the hidden dimension, so it's independent of the batch size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=6717)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class LayerNorm(nn.Module):\n", " __constants__ = ['eps']\n", " def __init__(self, eps=1e-5):\n", " super().__init__()\n", " self.eps = eps\n", " self.mult = nn.Parameter(tensor(1.))\n", " self.add = nn.Parameter(tensor(0.))\n", "\n", " def forward(self, x):\n", " m = x.mean((1,2,3), keepdim=True)\n", " v = x.var ((1,2,3), keepdim=True)\n", " x = (x-m) / ((v+self.eps).sqrt())\n", " return x*self.mult + self.add" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def conv_ln(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(LayerNorm())\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.8, conv_ln, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [nan, tensor(0.1321, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "train: [nan, tensor(0.0986, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "train: [nan, tensor(0.0986, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "CPU times: user 4.56 s, sys: 862 ms, total: 5.42 s\n", "Wall time: 5.42 s\n" ] } ], "source": [ "%time run.fit(3, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*Thought experiment*: can this distinguish foggy days from sunny days (assuming you're using it before the 
first conv)?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Instance norm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From [the paper](https://arxiv.org/abs/1607.08022): " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The key difference between **contrast** and batch normalization is that the latter applies the normalization to a whole batch of images instead for single ones:\n", "\n", "\\begin{equation}\\label{eq:bnorm}\n", " y_{tijk} = \\frac{x_{tijk} - \\mu_{i}}{\\sqrt{\\sigma_i^2 + \\epsilon}},\n", " \\quad\n", " \\mu_i = \\frac{1}{HWT}\\sum_{t=1}^T\\sum_{l=1}^W \\sum_{m=1}^H x_{tilm},\n", " \\quad\n", " \\sigma_i^2 = \\frac{1}{HWT}\\sum_{t=1}^T\\sum_{l=1}^W \\sum_{m=1}^H (x_{tilm} - \\mu_i)^2.\n", "\\end{equation}\n", "\n", "In order to combine the effects of instance-specific normalization and batch normalization, we propose to replace the latter by the *instance normalization* (also known as *contrast normalization*) layer:\n", "\n", "\\begin{equation}\\label{eq:inorm}\n", " y_{tijk} = \\frac{x_{tijk} - \\mu_{ti}}{\\sqrt{\\sigma_{ti}^2 + \\epsilon}},\n", " \\quad\n", " \\mu_{ti} = \\frac{1}{HW}\\sum_{l=1}^W \\sum_{m=1}^H x_{tilm},\n", " \\quad\n", " \\sigma_{ti}^2 = \\frac{1}{HW}\\sum_{l=1}^W \\sum_{m=1}^H (x_{tilm} - \\mu_{ti})^2.\n", "\\end{equation}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=7114)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class InstanceNorm(nn.Module):\n", " __constants__ = ['eps']\n", " def __init__(self, nf, eps=1e-0):\n", " super().__init__()\n", " self.eps = eps\n", " self.mults = nn.Parameter(torch.ones (nf,1,1))\n", " self.adds = nn.Parameter(torch.zeros(nf,1,1))\n", "\n", " def forward(self, x):\n", " m = x.mean((2,3), keepdim=True)\n", " v = x.var ((2,3), keepdim=True)\n", " res = (x-m) / ((v+self.eps).sqrt())\n", " return res*self.mults + 
self.adds" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def conv_in(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(InstanceNorm(nf))\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.1, conv_in, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [nan, tensor(0.0986, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "train: [nan, tensor(0.0986, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "train: [nan, tensor(0.0986, device='cuda:0')]\n", "valid: [nan, tensor(0.0991, device='cuda:0')]\n", "CPU times: user 4.46 s, sys: 718 ms, total: 5.18 s\n", "Wall time: 5.18 s\n" ] } ], "source": [ "%time run.fit(3, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*Question*: why can't this classify anything?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lost in all those norms? The authors from the [group norm paper](https://arxiv.org/pdf/1803.08494.pdf) have you covered:\n", "\n", "![Various norms](images/norms.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Group norm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=7213)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*From the PyTorch docs:*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`GroupNorm(num_groups, num_channels, eps=1e-5, affine=True)`\n", "\n", "The input channels are separated into `num_groups` groups, each containing\n", "``num_channels / num_groups`` channels. 
The mean and standard-deviation are calculated\n", "separately over each group. $\\gamma$ and $\\beta$ are learnable\n", "per-channel affine transform parameter vectors of size `num_channels` if\n", "`affine` is ``True``.\n", "\n", "This layer uses statistics computed from input data in both training and\n", "evaluation modes.\n", "\n", "Args:\n", "- num_groups (int): number of groups to separate the channels into\n", "- num_channels (int): number of channels expected in input\n", "- eps: a value added to the denominator for numerical stability. Default: 1e-5\n", "- affine: a boolean value that when set to ``True``, this module\n", " has learnable per-channel affine parameters initialized to ones (for weights)\n", " and zeros (for biases). Default: ``True``.\n", "\n", "Shape:\n", "- Input: `(N, num_channels, *)`\n", "- Output: `(N, num_channels, *)` (same shape as input)\n", "\n", "Examples::\n", "\n", " >>> input = torch.randn(20, 6, 10, 10)\n", " >>> # Separate 6 channels into 3 groups\n", " >>> m = nn.GroupNorm(3, 6)\n", " >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm)\n", " >>> m = nn.GroupNorm(6, 6)\n", " >>> # Put all 6 channels into a single group (equivalent with LayerNorm)\n", " >>> m = nn.GroupNorm(1, 6)\n", " >>> # Activating the module\n", " >>> output = m(input)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fix small batch sizes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What's the problem?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When we compute the statistics (mean and std) for a BatchNorm layer on a small batch, it is possible that we get a standard deviation very close to 0, because there aren't many samples (the variance of a single sample is 0, since it is equal to its own mean)." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=7304)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = DataBunch(*get_dls(train_ds, valid_ds, 2), c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [2.35619296875, tensor(0.1649, device='cuda:0')]\n", "valid: [2867.7198, tensor(0.2604, device='cuda:0')]\n", "CPU times: user 1min 32s, sys: 835 ms, total: 1min 33s\n", "Wall time: 1min 33s\n" ] } ], "source": [ "%time run.fit(1, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running Batch Norm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To solve this problem we introduce a Running BatchNorm that normalizes with smoothed running estimates of the mean and variance instead of the statistics of the current batch alone." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=7516)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class RunningBatchNorm(nn.Module):\n", " def __init__(self, nf, mom=0.1, eps=1e-5):\n", " super().__init__()\n", " self.mom,self.eps = mom,eps\n", " self.mults = nn.Parameter(torch.ones (nf,1,1))\n", " self.adds = nn.Parameter(torch.zeros(nf,1,1))\n", " self.register_buffer('sums', torch.zeros(1,nf,1,1))\n", " self.register_buffer('sqrs', torch.zeros(1,nf,1,1))\n", " self.register_buffer('batch', tensor(0.))\n", " self.register_buffer('count', tensor(0.))\n", " self.register_buffer('step', tensor(0.))\n", " self.register_buffer('dbias', tensor(0.))\n", "\n", " def update_stats(self, x):\n", " bs,nc,*_ = x.shape\n", " self.sums.detach_()\n", " self.sqrs.detach_()\n", " dims = (0,2,3)\n", " s = x.sum(dims, keepdim=True)\n", " ss = (x*x).sum(dims, keepdim=True)\n", " c = self.count.new_tensor(x.numel()/nc)\n", " mom1 = 1 - (1-self.mom)/math.sqrt(bs-1)\n", " self.mom1 = self.dbias.new_tensor(mom1)\n", " self.sums.lerp_(s, self.mom1)\n", " self.sqrs.lerp_(ss, self.mom1)\n", " self.count.lerp_(c, self.mom1)\n", " self.dbias = self.dbias*(1-self.mom1) + self.mom1\n", " self.batch += bs\n", " self.step += 1\n", "\n", " def forward(self, x):\n", " if self.training: self.update_stats(x)\n", " sums = self.sums\n", " sqrs = self.sqrs\n", " c = self.count\n", " if self.step<100:\n", " sums = sums / self.dbias\n", " sqrs = sqrs / self.dbias\n", " c = c / self.dbias\n", " means = sums/c\n", " vars = (sqrs/c).sub_(means*means)\n", " if bool(self.batch < 20): vars.clamp_min_(0.01)\n", " x = (x-means).div_((vars.add_(self.eps)).sqrt())\n", " return x.mul_(self.mults).add_(self.adds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def conv_rbn(ni, nf, ks=3, stride=2, bn=True, **kwargs):\n", " layers = 
[nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),\n", " GeneralRelu(**kwargs)]\n", " if bn: layers.append(RunningBatchNorm(nf))\n", " return nn.Sequential(*layers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.4, conv_rbn, cbs=cbfs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.157932021484375, tensor(0.9511, device='cuda:0')]\n", "valid: [0.0986408935546875, tensor(0.9730, device='cuda:0')]\n", "CPU times: user 16.5 s, sys: 1.57 s, total: 18.1 s\n", "Wall time: 18.1 s\n" ] } ], "source": [ "%time run.fit(1, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This solves the small batch size issue!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What can we do in a single epoch?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's see with a decent batch size what result we can get." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Jump_to lesson 10 video](https://course19.fast.ai/videos/?lesson=10&t=8068)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = DataBunch(*get_dls(train_ds, valid_ds, 32), c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn,run = get_learn_run(nfs, data, 0.9, conv_rbn, cbs=cbfs\n", " +[partial(ParamScheduler,'lr', sched_lin(1., 0.2))])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: [0.1573110546875, tensor(0.9521, device='cuda:0')]\n", "valid: [0.09242745971679688, tensor(0.9818, device='cuda:0')]\n", "CPU times: user 16.6 s, sys: 1.52 s, total: 18.1 s\n", "Wall time: 18.2 s\n" ] } ], "source": [ "%time run.fit(1, learn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Export" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "IPython.notebook.kernel.execute('!./notebook2script.py ' + IPython.notebook.notebook_name )" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nb_auto_export()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }