{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "from nb_004 import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_PATH = Path('data')\n",
    "PATH = DATA_PATH/'cifar10'\n",
    "\n",
    "data_mean,data_std = map(tensor, ([0.491, 0.482, 0.447], [0.247, 0.243, 0.261]))\n",
    "cifar_norm,cifar_denorm = normalize_funcs(data_mean,data_std)\n",
    "\n",
    "train_tfms = [flip_lr(p=0.5),\n",
    "              pad(padding=4),\n",
    "              crop(size=32, row_pct=(0,1.), col_pct=(0,1.))]\n",
    "valid_tfms = []\n",
    "\n",
    "bs = 64"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tweaks to the OptimWrapper to handle an array of lrs/wds/..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will modify OptimWrapper so that it accepts lists of hyperparameters (learning rate, weight decay, momentum, beta, alpha). We will use this by first defining a set of groups of layers and then defining one hyperparameter value for each group.\n",
    "\n",
    "An example of this is _Discriminative learning rates_, which consists of using different learning rates on different layers during training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "ModuleList = Collection[nn.Module]\n",
    "ParamList = Collection[nn.Parameter]\n",
    "\n",
    "bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)\n",
    "\n",
    "def requires_grad(l:nn.Module, b:Optional[bool]=None)->Optional[bool]:\n",
    "    \"If b is not set requires_grad on all params in l, else return requires_grad of first param\"\n",
    "    ps = list(l.parameters())\n",
    "    if not ps: return None\n",
    "    if b is None: return ps[0].requires_grad\n",
    "    for p in ps: p.requires_grad=b\n",
    "\n",
    "def trainable_params(m:nn.Module)->ParamList: \n",
    "    \"Return list of trainable params in `m`\"\n",
    "    res = filter(lambda p: p.requires_grad, m.parameters())\n",
    "    return res\n",
    "\n",
    "def split_bn_bias(layer_groups:ModuleList)->ModuleList:\n",
    "    \"Sort each layer in  `layer_groups` into batchnorm (`bn_types`) and non-batchnorm groups\"\n",
    "    split_groups = []\n",
    "    for l in layer_groups:\n",
    "        l1,l2 = [],[]\n",
    "        for c in l.children():\n",
    "            if isinstance(c, bn_types): l2.append(c)\n",
    "            else:                       l1.append(c)\n",
    "        split_groups += [nn.Sequential(*l1), nn.Sequential(*l2)]\n",
    "    return split_groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(optim.SGD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class OptimWrapper():\n",
    "    \"Basic wrapper around an optimizer to simplify HP changes\"\n",
    "    def __init__(self, opt:optim.Optimizer, wd:Floats=0., true_wd:bool=False, bn_wd:bool=True)->None:\n",
    "        self.opt,self.true_wd,self.bn_wd = opt,true_wd,bn_wd\n",
    "        self.opt_keys = list(self.opt.param_groups[0].keys())\n",
    "        self.opt_keys.remove('params')\n",
    "        self.read_defaults()\n",
    "        self.wd = wd\n",
    "        \n",
    "    @classmethod\n",
    "    def create(cls, opt_fn:Union[type,Callable], lr:Union[float,Tuple,List], \n",
    "               layer_groups:ModuleList, **kwargs:Any)->optim.Optimizer:\n",
    "        \"Create an optim.Optimizer from `opt_fn` with `lr`. Set lr on `layer_groups``\"\n",
    "        split_groups = split_bn_bias(layer_groups)\n",
    "        opt = opt_fn([{'params': trainable_params(l), 'lr':0} for l in split_groups])\n",
    "        opt = cls(opt, **kwargs)\n",
    "        opt.lr = listify(lr, layer_groups)\n",
    "        return opt\n",
    "    \n",
    "    def __repr__(self)->str:\n",
    "        return f'OptimWrapper over {repr(self.opt)}.\\nTrue weight decay: {self.true_wd}'\n",
    "\n",
    "    #Pytorch optimizer methods\n",
    "    def step(self)->None:\n",
    "        \"Set weight decay and step optimizer\"\n",
    "        # weight decay outside of optimizer step (AdamW)\n",
    "        if self.true_wd:\n",
    "            for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]):\n",
    "                for p in pg1['params']: p.data.mul_(1 - wd*lr)\n",
    "                if self.bn_wd:\n",
    "                    for p in pg2['params']: p.data.mul_(1 - wd*lr)\n",
    "            self.set_val('weight_decay', listify(0, self._wd))\n",
    "        self.opt.step()\n",
    "    \n",
    "    def zero_grad(self)->None: \n",
    "        \"Clear optimizer gradients\"\n",
    "        self.opt.zero_grad()\n",
    "    \n",
    "    #Hyperparameters as properties\n",
    "    @property\n",
    "    def lr(self)->float: \n",
    "        \"Get learning rate\"\n",
    "        return self._lr[-1]\n",
    "\n",
    "    @lr.setter\n",
    "    def lr(self, val:float)->None: \n",
    "        \"Set learning rate\"\n",
    "        self._lr = self.set_val('lr', listify(val, self._lr))\n",
    "    \n",
    "    @property\n",
    "    def mom(self)->float: \n",
    "        \"Get momentum\"\n",
    "        return self._mom[-1]\n",
    "\n",
    "    @mom.setter\n",
    "    def mom(self, val:float)->None:\n",
    "        \"Set momentum\"\n",
    "        if 'momentum' in self.opt_keys: self.set_val('momentum', listify(val, self._mom))\n",
    "        elif 'betas' in self.opt_keys:  self.set_val('betas', (listify(val, self._mom), self._beta))\n",
    "        self._mom = listify(val, self._mom)\n",
    "    \n",
    "    @property\n",
    "    def beta(self)->float: \n",
    "        \"get beta\"\n",
    "        return None if self._beta is None else self._beta[-1]\n",
    "\n",
    "    @beta.setter\n",
    "    def beta(self, val:float)->None:\n",
    "        \"Set beta (or alpha as makes sense for give optimizer)\"\n",
    "        if val is None: return\n",
    "        if 'betas' in self.opt_keys:    self.set_val('betas', (self._mom, listify(val, self._beta)))\n",
    "        elif 'alpha' in self.opt_keys:  self.set_val('alpha', listify(val, self._beta))\n",
    "        self._beta = listify(val, self._beta)\n",
    "    \n",
    "    @property\n",
    "    def wd(self)->float: \n",
    "        \"Get weight decay\"\n",
    "        return self._wd[-1]\n",
    "\n",
    "    @wd.setter\n",
    "    def wd(self, val:float)->None:\n",
    "        \"Set weight decay\"\n",
    "        if not self.true_wd: self.set_val('weight_decay', listify(val, self._wd), bn_groups=self.bn_wd)\n",
    "        self._wd = listify(val, self._wd)\n",
    "    \n",
    "    #Helper functions\n",
    "    def read_defaults(self)->None:\n",
    "        \"Read the values inside the optimizer for the hyper-parameters\"\n",
    "        self._beta = None\n",
    "        if 'lr' in self.opt_keys: self._lr = self.read_val('lr')\n",
    "        if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum')\n",
    "        if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha')\n",
    "        if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas')\n",
    "        if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay')\n",
    "    \n",
    "    def set_val(self, key:str, val:Any, bn_groups:bool=True)->Any:\n",
    "        \"Set the values inside the optimizer dictionary at the key\"\n",
    "        if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)]\n",
    "        for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]): \n",
    "            pg1[key] = v\n",
    "            if bn_groups: pg2[key] = v\n",
    "        return val\n",
    "    \n",
    "    def read_val(self, key:str) -> Union[List[float],Tuple[List[float],List[float]]]:\n",
    "        \"Read a hyper-parameter key in the optimizer dictionary.\"\n",
    "        val = [pg[key] for pg in self.opt.param_groups[::2]]\n",
    "        if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val]\n",
    "        return val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def children(m:nn.Module)->ModuleList: \n",
    "    \"Get children of module\"\n",
    "    return list(m.children())\n",
    "def num_children(m:nn.Module)->int: \n",
    "    \"Get number of child modules in module\"\n",
    "    return len(children(m))\n",
    "def range_children(m:nn.Module)->Iterator[int]: \n",
    "    \"Return iterator of len of children of m\"\n",
    "    return range(num_children(m))\n",
    "\n",
    "flatten_model=lambda l: sum(map(flatten_model,l.children()),[]) if num_children(l) else [l]\n",
    "def first_layer(m:nn.Module)->nn.Module:\n",
    "    \"Retrieve first layer in a module\"\n",
    "    return flatten_model(m)[0]\n",
    "\n",
    "def split_model_idx(model:nn.Module, idxs:Collection[int])->ModuleList:\n",
    "    \"Split the model according to the indices in [idxs]\"\n",
    "    layers = flatten_model(model)\n",
    "    if idxs[0] != 0: idxs = [0] + idxs\n",
    "    if idxs[-1] != len(layers): idxs.append(len(layers))\n",
    "    return [nn.Sequential(*layers[i:j]) for i,j in zip(idxs[:-1],idxs[1:])]\n",
    "\n",
    "def split_model(model:nn.Module, splits:Collection[ModuleList], want_idxs:bool=False):\n",
    "    \"Split the model according to the layers in [splits]\"\n",
    "    layers = flatten_model(model)\n",
    "    idxs = [layers.index(first_layer(s)) for s in listify(splits)]\n",
    "    res = split_model_idx(model, idxs)\n",
    "    return (res,idxs) if want_idxs else res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will now try our Wrapper with the Darknet neural net architecture."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We are now going to split the model in layers so that we can try our OptimWrapper with more than one value per hyperparameter. In particular, we are going to split the 18 layers in the model into three groups: 0-5, 6-9 and 10-18. Afterwards, when setting the value for our hyperparameters, we will need to define a different value for each of these groups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "splits = [model.layers[9], model.layers[15]]\n",
    "layer_groups,idxs = split_model(model, splits, want_idxs=True)\n",
    "lrs = np.array([1e-3,1e-2,0.1])\n",
    "tst_opt = OptimWrapper.create(optim.SGD, lrs, layer_groups, bn_wd=False)\n",
    "idxs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(tst_opt.opt.param_groups), tst_opt.opt.param_groups[0]['weight_decay']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.opt.param_groups[0]['lr'],tst_opt.opt.param_groups[1]['lr']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.wd = 1e-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.opt.param_groups[0]['weight_decay'],tst_opt.opt.param_groups[1]['weight_decay']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.lr = 1e-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.opt.param_groups[0]['lr'],tst_opt.opt.param_groups[1]['lr']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can set the hyperparameters in two ways:\n",
    "\n",
    "1. optimizer_object.\\_hyperparameter = [val1, val2 ..., valn] (n = number of layer groups)\n",
    "\n",
    "2. optimizer_object.hyperparameter = val\n",
    "\n",
    "If we chose to set it in way 1, we must specify a number of values exactly equal to the number of layer groups. If we chose to set it in way 2, the chosen value will be repeated for all layer groups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.lr, tst_opt._lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tst_opt.wd, tst_opt._wd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect exception\n",
    "# tst_opt.wd = [0.1,0.1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect exception\n",
    "# tst_opt.lr = [0.1,0.1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now let's tweak the learner to handle this."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have added some important functions into the Learner class. We have built *create_opt* that creates the optimizer with the wrapper, _split_ that splits our model into layer groups, *freeze_to*, _freeze_ and _unfreeze_ that allow us to freeze and unfreeze parts of the network or even the whole network.\n",
    "\n",
    "Add gradient clipping at this stage in a callback."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)\n",
    "\n",
    "def set_bn_eval(m:nn.Module)->None:\n",
    "    \"Set bn layers in eval mode for all recursive children of m\"\n",
    "    for l in m.children():\n",
    "        if isinstance(l, bn_types) and not next(l.parameters()).requires_grad:\n",
    "            l.eval()\n",
    "        set_bn_eval(l)\n",
    "\n",
    "@dataclass\n",
    "class BnFreeze(Callback):\n",
    "    \"Set all bntypes layers in `learn` to eval() on_epoch_begin\"\n",
    "    learn:Learner\n",
    "    def on_epoch_begin(self, **kwargs:Any)->None: \n",
    "        \"Put bn layers in eval mode on epoch_begin\"\n",
    "        set_bn_eval(self.learn.model)\n",
    "\n",
    "def even_mults(start:float, stop:float, n:int)->np.ndarray:\n",
    "    \"Build evenly stepped schedule from start to stop in n steps\"\n",
    "    mult = stop/start\n",
    "    step = mult**(1/(n-1))\n",
    "    return np.array([start*(step**i) for i in range(n)])\n",
    "\n",
    "default_lr = slice(3e-3)\n",
    "default_wd = 1e-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "\n",
    "SplitFuncOrIdxList = Union[Callable, Collection[ModuleList]]\n",
    "@dataclass\n",
    "class Learner():\n",
    "    \"Object that wraps together some data, a model, a loss function and an optimizer\"\n",
    "    data:DataBunch\n",
    "    model:nn.Module\n",
    "    opt_fn:Callable=AdamW\n",
    "    loss_fn:Callable=F.cross_entropy\n",
    "    metrics:Collection[Callable]=None\n",
    "    true_wd:bool=True\n",
    "    bn_wd:bool=True\n",
    "    wd:Floats=default_wd\n",
    "    train_bn:bool=True\n",
    "    path:str = None\n",
    "    model_dir:str = 'models'\n",
    "    callback_fns:Collection[Callable]=None\n",
    "    callbacks:Collection[Callback]=field(default_factory=list)\n",
    "    layer_groups:Collection[nn.Module]=None\n",
    "    def __post_init__(self)->None:\n",
    "        \"Setup path,metrics, callbacks and ensure model directory exists\"\n",
    "        self.path = Path(ifnone(self.path, self.data.path))\n",
    "        (self.path/self.model_dir).mkdir(parents=True, exist_ok=True)\n",
    "        self.model = self.model.to(self.data.device)\n",
    "        self.metrics=listify(self.metrics)\n",
    "        if not self.layer_groups: self.layer_groups = [nn.Sequential(*flatten_model(self.model))]\n",
    "        self.callbacks = listify(self.callbacks)\n",
    "        self.callback_fns = [Recorder] + listify(self.callback_fns)\n",
    "\n",
    "    def lr_range(self, lr:Union[float,slice])->np.ndarray:\n",
    "        \"Build learning rate schedule\"\n",
    "        if not isinstance(lr,slice): return lr\n",
    "        if lr.start: res = even_mults(lr.start, lr.stop, len(self.layer_groups))\n",
    "        else: res = [lr.stop/3]*(len(self.layer_groups)-1) + [lr.stop]\n",
    "        return np.array(res)\n",
    "        \n",
    "    def fit(self, epochs:int, lr:Union[Floats,slice]=default_lr, \n",
    "            wd:Floats=None, callbacks:Collection[Callback]=None)->None:\n",
    "        \"fit the model on this learner with `lr` learning rate, `wd` weight decay for `epochs` with `callbacks`\"\n",
    "        lr = self.lr_range(lr)\n",
    "        if wd is None: wd = self.wd\n",
    "        self.create_opt(lr, wd)\n",
    "        callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)\n",
    "        fit(epochs, self.model, self.loss_fn, opt=self.opt, data=self.data, metrics=self.metrics,\n",
    "            callbacks=self.callbacks+callbacks)\n",
    "\n",
    "    def create_opt(self, lr:Floats, wd:Floats=0.)->None:\n",
    "        \"create optimizer with `lr` learning rate and `wd` weight decay\"\n",
    "        self.opt = OptimWrapper.create(self.opt_fn, lr, self.layer_groups, wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd)\n",
    "        \n",
    "    def split(self, split_on:SplitFuncOrIdxList)->None:\n",
    "        \"split the model at `split_on`\"\n",
    "        if isinstance(split_on,Callable): self.layer_groups = split_on(self.model)\n",
    "        else: self.layer_groups = split_model(self.model, split_on)\n",
    "\n",
    "    def freeze_to(self, n:int)->None:\n",
    "        \"freeze layers up to layer `n`\"\n",
    "        for g in self.layer_groups[:n]:\n",
    "            for l in g:\n",
    "                if not self.train_bn or not isinstance(l, bn_types): requires_grad(l, False)\n",
    "        for g in self.layer_groups[n:]: requires_grad(g, True)\n",
    "\n",
    "    def freeze(self)->None:\n",
    "        \"freeze up to last layer\"\n",
    "        assert(len(self.layer_groups)>1)\n",
    "        self.freeze_to(-1)\n",
    "        \n",
    "    def unfreeze(self): \n",
    "        \"unfreeze entire model\"\n",
    "        self.freeze_to(0)\n",
    "    def __del__(self): del(self.model, self.data)        \n",
    "    def save(self, name:PathOrStr): \n",
    "        \"save model with `name` to `self.model_dir`\"\n",
    "        torch.save(self.model.state_dict(), self.path/self.model_dir/f'{name}.pth')\n",
    "    def load(self, name:PathOrStr): \n",
    "        \"load model `name` from `self.model_dir\"\n",
    "        self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def fit_one_cycle(learn:Learner, cyc_len:int,\n",
    "                  max_lr:Union[Floats,slice]=default_lr, moms:Tuple[float,float]=(0.95,0.85),\n",
    "                  div_factor:float=25., pct_start:float=0.3, wd:float=None, **kwargs)->None:\n",
    "    \"Fits a model following the 1cycle policy\"\n",
    "    max_lr = learn.lr_range(max_lr)\n",
    "    cbs = [OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor,\n",
    "                             pct_start=pct_start, **kwargs)]\n",
    "    learn.fit(cyc_len, max_lr, wd=wd, callbacks=cbs)\n",
    "\n",
    "Learner.fit_one_cycle = fit_one_cycle\n",
    "Learner.lr_find = lr_find"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ds = ImageDataset.from_folder(PATH/'train', classes=['airplane','dog'])\n",
    "valid_ds = ImageDataset.from_folder(PATH/'test', classes=['airplane','dog'])\n",
    "data = DataBunch.create(train_ds, valid_ds, bs=bs, train_tfm=train_tfms, valid_tfm=valid_tfms, dl_tfms=cifar_norm)\n",
    "len(data.train_dl), len(data.valid_dl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)\n",
    "learn = Learner(data, model, true_wd=True)\n",
    "learn.metrics = [accuracy]\n",
    "learn.split((model.layers[9],model.layers[15]))\n",
    "learn.freeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l1,l2 = model.layers[0],model.layers[-1]; l1,l2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requires_grad(l1[0]),requires_grad(l1[1]),requires_grad(l2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we know, setting a hyperparameter to a list of length different from the number of layer groups will return an error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect exception\n",
    "# learn.fit(1, [0.1,0.1], wd=1e-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit_one_cycle(1, slice(1e-1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.recorder.plot_lr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit(1, 1e-1, wd=1e-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.opt._lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit(1, slice(1e-3,1e-1), wd=1e-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.opt._lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.unfreeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "requires_grad(l1[0]),requires_grad(l1[1]),requires_grad(l2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit(1, lrs, wd=1e-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)\n",
    "learn = Learner(data, model)\n",
    "learn.metrics = [accuracy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit(1,0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.opt._lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def darknet_split(m) : return split_model(m,[m.layers[9],m.layers[15]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)\n",
    "learn = Learner(data, model, metrics=[accuracy])\n",
    "learn.split(darknet_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(learn.layer_groups)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit(1, lrs, wd=[1e-4,1e-3,1e-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.opt._wd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# exception expected\n",
    "# learn.fit(1, lrs, wd=[1e-4,1e-3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "LRs and WDs are the easiest to pass through the Learner, but if a Callback sets an array of moms or betas, the OptimWrapper will handle them as discriminative moms/betas."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## See how it fits with the other callbacks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)\n",
    "learn = Learner(data, model, metrics = [accuracy])\n",
    "learn.split(darknet_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr_find(learn, start_lr=lrs/1000, end_lr=lrs*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.recorder.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sched = OneCycleScheduler(learn, lrs)\n",
    "learn.fit(1, lrs, callbacks=[sched])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}