{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from nb_008 import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Rossmann" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To create the feature-engineered filed train_clean and test_clean from the initial data, run x_009a_rossman_data_clean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PATH = Path('data/rossmann/')\n", "train_df = pd.read_feather(PATH/'train_clean')\n", "test_df = pd.read_feather(PATH/'test_clean')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',\n", " 'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',\n", " 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',\n", " 'SchoolHoliday_fw', 'SchoolHoliday_bw']\n", "\n", "cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',\n", " 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', \n", " 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',\n", " 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']\n", "\n", "n = len(train_df); n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx = np.random.permutation(range(n))[:2000]\n", "idx.sort()\n", "small_train_df = train_df.iloc[idx[:1000]]\n", "small_test_df = train_df.iloc[idx[1000:]]\n", "small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']\n", "small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']\n", "small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]\n", "small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "StrList = Collection[str]\n", "@dataclass\n", "class TabularTransform():\n", " \"A transform for tabular dataframe\"\n", " cat_names:StrList\n", " cont_names:StrList\n", " \n", " def __call__(self, df:DataFrame, test:bool=False):\n", " \"Applies the correct function to `df` depending if it's the training dataframe or not\"\n", " func = self.apply_test if test else self.apply_train\n", " func(df)\n", " \n", " def apply_train(self, df:DataFrame):\n", " \"Function applied to `df` if it's the train set\"\n", " raise NotImplementedError\n", " def apply_test(self, df:DataFrame):\n", " \"Function applied to `df` if it's the test set\"\n", " self.apply_train(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class Categorify(TabularTransform):\n", " \"Transforms the categorical variables to that type.\"\n", " \n", " def 
apply_train(self, df:DataFrame):\n", "        self.categories = {}\n", "        for n in self.cat_names: \n", "            df[n] = df[n].astype('category').cat.as_ordered()\n", "            self.categories[n] = df[n].cat.categories\n", "    \n", "    def apply_test(self, df:DataFrame):\n", "        #Reuse the categories seen at train time so the codes stay consistent\n", "        for n in self.cat_names:\n", "            df[n] = pd.Categorical(df[n], categories=self.categories[n], ordered=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "categorify = Categorify(small_cat_vars, small_cont_vars)\n", "categorify(small_train_df)\n", "categorify(small_test_df, test=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df['PromoInterval'].cat.codes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df['Store'].cat.codes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON CONSTANT')\n", "\n", "@dataclass\n", "class FillMissing(TabularTransform):\n", "    \"Fill the missing values in continuous columns\"\n", "    fill_strategy:FillStrategy=FillStrategy.MEDIAN\n", "    add_col:bool=True\n", "    fill_val:float=0.\n", "    \n", "    def apply_train(self, df:DataFrame):\n", "        #Compute the fillers on the train set and store them for apply_test\n", "        self.na_dict = {}\n", "        for name in self.cont_names:\n", "            if pd.isnull(df[name]).sum():\n", "                if self.add_col: \n", "                    df[name+'_na'] = pd.isnull(df[name])\n", "                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')\n", "                if self.fill_strategy == FillStrategy.MEDIAN: filler = df[name].median() \n", "                elif self.fill_strategy == FillStrategy.CONSTANT: filler = self.fill_val\n", "                else: filler = df[name].dropna().value_counts().idxmax()\n", "                df[name] = df[name].fillna(filler)\n", "                self.na_dict[name] = filler\n", "    \n", "    def apply_test(self, df:DataFrame): \n", "        for name in self.cont_names:\n", "            if name in self.na_dict:\n", "                if self.add_col: \n", "                    df[name+'_na'] = pd.isnull(df[name])\n", "                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')\n", "                df[name] = df[name].fillna(self.na_dict[name])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fill_missing = FillMissing(small_cat_vars, small_cont_vars)\n", "fill_missing(small_train_df)\n", "fill_missing(small_test_df, test=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df[small_train_df['CompetitionDistance_na'] == True]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df[small_test_df['CompetitionDistance_na'] == True]" ] },
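{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick illustration of the three fill strategies on toy values (illustrative numbers only, not pipeline data): MEDIAN fills with the column median, COMMON with the most frequent value, CONSTANT with the given `fill_val`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Toy check of the fillers each strategy would compute for this column:\n", "#median of [0, 1, 5, 5] is 3.0, most common value is 5.0; CONSTANT would use fill_val.\n", "s = pd.Series([0., 1., 5., 5., np.nan])\n", "s.median(), s.dropna().value_counts().idxmax()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from pandas.api.types import is_numeric_dtype, is_categorical_dtype\n", "\n", "OptStrList = Optional[StrList]\n", "OptStats = Optional[Tuple[np.ndarray, np.ndarray]]\n", "OptTabTfms = Optional[Collection[TabularTransform]]\n", "OptDataFrame = Optional[DataFrame]\n", "\n", "class TabularDataset(DatasetBase):\n", "    \"Class for tabular data\"\n", "    def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None, cont_names:OptStrList=None, \n", "                 stats:OptStats=None, log_output:bool=False):\n", "        if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes\n", "        self.y = 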
torch.tensor(df[dep_var].values)\n", "        if log_output: self.y = torch.log(self.y.float())\n", "        n = len(self.y)\n", "        if cat_names and len(cat_names) >= 1:\n", "            self.cats = np.stack([c.cat.codes.values for n,c in df[cat_names].items()], 1) + 1\n", "        else: self.cats = np.zeros((n,1))\n", "        self.cats = LongTensor(self.cats.astype(np.int64))\n", "        if cont_names and len(cont_names) >= 1:\n", "            self.conts = np.stack([c.astype('float32').values for n,c in df[cont_names].items()], 1)\n", "            means, stds = stats if stats is not None else (self.conts.mean(0), self.conts.std(0))\n", "            self.conts = (self.conts - means[None]) / stds[None]\n", "            self.stats = means,stds\n", "        else: \n", "            self.conts = np.zeros((n,1), dtype=np.float32)\n", "            self.stats = None\n", "        self.conts = FloatTensor(self.conts)\n", "    \n", "    def __len__(self) -> int: return len(self.y)\n", "    def __getitem__(self, idx) -> Tuple[Tuple[LongTensor,FloatTensor], Tensor]: \n", "        return ((self.cats[idx], self.conts[idx]), self.y[idx])\n", "    @property\n", "    def c(self) -> int: return 1\n", "    \n", "    @classmethod\n", "    def from_dataframe(cls, df:DataFrame, dep_var:str, tfms:OptTabTfms=None, cat_names:OptStrList=None, \n", "                       cont_names:OptStrList=None, stats:OptStats=None, log_output:bool=False) -> 'TabularDataset':\n", "        \"Creates a `TabularDataset` from `df` after applying optional transforms\"\n", "        if cat_names is None: cat_names = [n for n in df.columns if is_categorical_dtype(df[n])]\n", "        if cont_names is None: cont_names = [n for n in df.columns if is_numeric_dtype(df[n]) and not n==dep_var]\n", "        if tfms is None: tfms = []\n", "        for i,tfm in enumerate(tfms):\n", "            #An already instantiated transform was fitted on train: apply it in test mode.\n", "            if isinstance(tfm, TabularTransform): tfm(df, test=True)\n", "            else:\n", "                tfm = tfm(cat_names, cont_names)\n", "                tfm(df)\n", "                tfms[i] = tfm\n", "                cat_names, cont_names = tfm.cat_names, tfm.cont_names\n", "        ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)\n", "        ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names\n", "        return ds\n", "    \n", "def data_from_tabulardf(path, train_df:DataFrame, valid_df:DataFrame, dep_var:str, test_df:OptDataFrame=None, \n", "                        tfms:OptTabTfms=None, cat_names:OptStrList=None, cont_names:OptStrList=None, \n", "                        stats:OptStats=None, log_output:bool=False, **kwargs) -> DataBunch:\n", "    \"Creates a `DataBunch` from train/valid/test dataframes.\"\n", "    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names, cont_names, stats, log_output)\n", "    valid_ds = TabularDataset.from_dataframe(valid_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                             train_ds.cont_names, train_ds.stats, log_output)\n", "    datasets = [train_ds, valid_ds]\n", "    if test_df is not None:\n", "        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                                      train_ds.cont_names, train_ds.stats, log_output))\n", "    return DataBunch.create(*datasets, path=path, **kwargs)" ] },
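{ "cell_type": "markdown", "metadata": {}, "source": [ "Note the `+ 1` applied to the stacked category codes in `TabularDataset`: pandas encodes a missing or unseen categorical value as code -1, so the shift maps it to 0 and keeps every code a valid non-negative embedding index. A toy check (illustrative values only):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#'c' is not among the categories and None is missing: both get code -1, shifted to 0.\n", "pd.Categorical(['a', 'c', None], categories=['a', 'b']).codes + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = pd.read_feather(PATH/'train_clean')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx = np.random.permutation(range(n))[:2000]\n", "idx.sort()\n", "small_train_df = train_df.iloc[idx[:1000]]\n", "small_test_df = train_df.iloc[idx[1000:]]\n", "small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']\n", "small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']\n", "small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]\n", "small_test_df = 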
small_test_df[small_cat_vars+small_cont_vars + ['Sales']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dep_var = 'Sales'\n", "tfms = [FillMissing, Categorify] #FillMissing first so that the added _na columns are categorified\n", "train_ds = TabularDataset.from_dataframe(small_train_df, dep_var, tfms, small_cat_vars, \n", "                                         small_cont_vars, log_output=True)\n", "valid_ds = TabularDataset.from_dataframe(small_test_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                         train_ds.cont_names, train_ds.stats, log_output=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_ds[2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_ds.stats, valid_ds.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',\n", "    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',\n", "    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',\n", "    'SchoolHoliday_fw', 'SchoolHoliday_bw']\n", "\n", "cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',\n", "    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', \n", "    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',\n", "    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dep_var = 'Sales'\n", "train_df = pd.read_feather(PATH/'train_clean')\n", "train_df = train_df[cat_vars+cont_vars+[dep_var, 'Date']].copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_df['Date'].min(), test_df['Date'].max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(test_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Pick a cut so the validation set has the same number of rows as the test set\n", "cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()\n", "cut" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = train_df.set_index('Date')\n", "train_df,valid_df = train_df[cut:], train_df[:cut]\n", "len(train_df),len(valid_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = [FillMissing, Categorify]\n", "data = data_from_tabulardf(PATH, train_df, valid_df, dep_var, tfms=tfms, cat_names=cat_vars, \n", "                           cont_names=cont_vars, log_output=True, num_workers=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] },
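{ "cell_type": "markdown", "metadata": {}, "source": [ "Each categorical variable goes through its own embedding; the embeddings are concatenated with the batch-normalized continuous variables and fed to fully connected layers. For regression with a `y_range`, the network ends with a sigmoid whose output is rescaled to $$y = (y_{max} - y_{min}) \\cdot \\sigma(x) + y_{min}$$ so that predictions always fall inside the target range." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "ListSizes = Collection[Tuple[int,int]]\n", "OptRange = Optional[Tuple[float,float]]\n", "\n", "class TabularModel(nn.Module):\n", "    \"Basic model for tabular data\"\n", "    \n", "    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int], drops:Collection[float], \n", "                 emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, is_reg:bool=False, is_multi:bool=False):\n", "        super().__init__()\n", "        self.embeds 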
= nn.ModuleList([get_embedding(ni, nf) for ni,nf in emb_szs])\n", "        self.emb_drop = nn.Dropout(emb_drop)\n", "        self.bn_cont = nn.BatchNorm1d(n_cont)\n", "        n_emb = sum(e.embedding_dim for e in self.embeds)\n", "        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range\n", "        if is_reg: final_act = None if y_range is None else nn.Sigmoid()\n", "        else: final_act = nn.LogSoftmax() if is_multi else nn.Sigmoid()\n", "        sizes = [n_emb + n_cont] + layers + [out_sz]\n", "        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [final_act]\n", "        layers = []\n", "        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+drops,actns)):\n", "            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)\n", "        self.layers = nn.Sequential(*layers)\n", "    \n", "    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:\n", "        if self.n_emb != 0:\n", "            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]\n", "            x = torch.cat(x, 1)\n", "            x = self.emb_drop(x)\n", "        if self.n_cont != 0:\n", "            x_cont = self.bn_cont(x_cont)\n", "            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont\n", "        x = self.layers(x)\n", "        if self.y_range is not None: x = (self.y_range[1] - self.y_range[0]) * x + self.y_range[0]\n", "        return x.squeeze()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#+1 leaves an embedding row for the 'missing/unknown' code 0\n", "cat_szs = [len(train_df[n].cat.categories)+1 for n in cat_vars]\n", "#Rule of thumb: embedding width is half the category count, capped at 50\n", "emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]\n", "emb_szs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "max_log_y = np.log(np.max(train_df['Sales']))\n", "y_range = torch.tensor([0, max_log_y*1.2], device=default_device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range, is_reg=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def exp_rmspe(pred:Tensor, targ:Tensor) -> Rank0Tensor:\n", "    pred, targ = torch.exp(pred), torch.exp(targ)\n", "    pct_var = (targ - pred)/targ\n", "    return torch.sqrt((pct_var**2).mean())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = Learner(data, model)\n", "learn.loss_fn = F.mse_loss\n", "learn.metrics = [exp_rmspe]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.lr_find()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.recorder.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with torch.no_grad():\n", "    pct_var,cnt = 0.,0\n", "    for x,y in learn.data.valid_dl:\n", "        out = learn.model(*x)\n", "        cnt += y.size(0)\n", "        y, out = torch.exp(y), torch.exp(out)\n", "        pct_var += ((y - out)/y).pow(2).sum()\n", "torch.sqrt(pct_var/cnt).item()" ] },
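{ "cell_type": "markdown", "metadata": {}, "source": [ "The loop above recomputes the validation metric by hand: since the targets are log sales, predictions and targets are exponentiated back before taking the root mean squared percentage error $$\\mathrm{RMSPE} = \\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}\\left(\\frac{y_i - \\hat{y}_i}{y_i}\\right)^2},$$ which should match the `exp_rmspe` values reported during training." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 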
3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }