{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Planet multi-target classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from nb_005a import *\n", "import pandas as pd\n", "from pandas import Series\n", "import csv\n", "from collections import OrderedDict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PATH = Path('data/planet')\n", "TRAIN = 'train-jpg'\n", "label_csv = PATH/'train_v2.csv'\n", "bs = 64" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "norm,denorm = normalize_funcs(*imagenet_stats)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(label_csv, header=0)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.iloc[:,1] = list(csv.reader(df.iloc[:,1], delimiter=' '))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tst = df.iloc[:,1].apply(lambda x: len(x))\n", "tst.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(tst != 1).any()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def uniqueify(x:Series) -> List[Any]: return list(OrderedDict.fromkeys(x).keys())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classes = uniqueify(np.concatenate(df.tags))\n", "' '.join(classes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class FilesMultiDataset(LabelDataset):\n", " def __init__(self, fns, labels, classes=None):\n", " self.classes = ifnone(classes, uniqueify(np.concatenate(labels)))\n", " self.class2idx = {v:k for k,v in enumerate(self.classes)}\n", " self.x = np.array(fns)\n", " self.y = [np.array([self.class2idx[o] for o in l], dtype=np.int64)\n", " for l in labels]\n", "\n", " def encode(self, x):\n", " res = np.zeros((self.c,), np.float32)\n", " res[x] = 1.\n", " return res\n", " \n", " def get_labels(self, idx): return [self.classes[i] for i in self.y[idx]]\n", " def __getitem__(self,i): return open_image(self.x[i]), self.encode(self.y[i])\n", " \n", " @classmethod\n", " def from_random_split(cls, path, folder, fns, labels, valid_pct, classes=None, test_name=None):\n", " train,valid = random_split(valid_pct, f'{path}/{folder}/' + fns, labels)\n", " train_ds = cls(*train, classes=classes)\n", " res = [train_ds,cls(*valid, classes=train_ds.classes)]\n", " if test_name:\n", " test_fns = get_image_files(Path(path)/test_name)\n", " test_labels = [[train_ds.classes[0]]]*len(test_fns)\n", " res.append(cls(test_fns, test_labels, classes=train_ds.classes))\n", " return res" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_path(x): return x+'.jpg'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = get_transforms(flip_vert=True, max_lighting=0.1, max_zoom=1.05, max_warp=0.)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"np.random.seed(42)\n", "datasets = FilesMultiDataset.from_random_split(\n", " PATH, TRAIN, get_path(df.image_name), df.tags, valid_pct=0.2, test_name='test-jpg')\n", "valid_ds = datasets[1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx=0\n", "valid_ds[idx][0].brightness(0.7).contrast(1.5).show(\n", " title=valid_ds.get_labels(idx), figsize=(5,5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "valid_ds[1][1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_data(size):\n", " return DataBunch.create(*datasets, path=PATH, ds_tfms=tfms, size=size,\n", " bs=bs, num_workers=12, tfms=norm)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data=get_data(64)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x,y = next(iter(data.valid_dl))\n", "x,y = x.data,y.data\n", "x.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def fbeta(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, beta:float=2, eps:float=1e-9, sigmoid:bool=True) -> Rank0Tensor:\n", " \"Computes the f_beta between preds and targets\"\n", " beta2 = beta**2\n", " if sigmoid: y_pred = y_pred.sigmoid()\n", " y_pred = (y_pred>thresh).float()\n", " y_true = y_true.float()\n", " TP = (y_pred*y_true).sum(dim=1)\n", " prec = TP/(y_pred.sum(dim=1)+eps)\n", " rec = TP/(y_true.sum(dim=1)+eps)\n", " res = (prec*rec)/(prec*beta2+rec+eps)*(1+beta2)\n", " return res.mean()\n", "\n", "def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True) -> Rank0Tensor:\n", " if sigmoid: y_pred = y_pred.sigmoid()\n", " return ((y_pred>thresh)==y_true.byte()).float().mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def f2_opt(y_pred, y_true, start=0.15, end=0.25, step=0.01):\n", " return max([fbeta(y_pred, y_true, th)\n", " for th in np.arange(start,end,step)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics = [accuracy_thresh, f2_opt]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = ConvLearner(data, tvm.resnet34, metrics=metrics, ps=0.4, loss_fn=nn.BCEWithLogitsLoss())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.lr_find()\n", "learn.recorder.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_size(size, it_num, lr):\n", " learn.data=get_data(size)\n", " learn.freeze()\n", " learn.fit_one_cycle(5, slice(lr))\n", " learn.save(str(it_num))\n", " learn.unfreeze()\n", " learn.fit_one_cycle(5, slice(lr/20, lr/2), pct_start=0.1)\n", " learn.save(str(it_num+1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lr=1e-2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_size(64, 0, lr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_size(128, 2, lr/2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_size(256, 2, lr/5)" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, slice(lr/100, lr/10), pct_start=0.1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fin" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }