{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from nb_008 import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Rossmann" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To create the feature-engineered filed train_clean and test_clean from the initial data, run x_009a_rossman_data_clean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PATH = Path('data/rossmann/')\n", "train_df = pd.read_feather(PATH/'train_clean')\n", "test_df = pd.read_feather(PATH/'test_clean')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',\n", " 'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',\n", " 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',\n", " 'SchoolHoliday_fw', 'SchoolHoliday_bw']\n", "\n", "cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',\n", " 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', \n", " 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',\n", " 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']\n", "\n", "n = len(train_df); n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx = np.random.permutation(range(n))[:2000]\n", "idx.sort()\n", "small_train_df = train_df.iloc[idx[:1000]]\n", "small_test_df = train_df.iloc[idx[1000:]]\n", "small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']\n", "small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']\n", "small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]\n", "small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "StrList = Collection[str]\n", "@dataclass\n", "class TabularTransform():\n", " \"A transform for tabular dataframe\"\n", " cat_names:StrList\n", " cont_names:StrList\n", " \n", " def __call__(self, df:DataFrame, test:bool=False):\n", " \"Applies the correct function to `df` depending if it's the training dataframe or not\"\n", " func = self.apply_test if test else self.apply_train\n", " func(df)\n", " \n", " def apply_train(self, df:DataFrame):\n", " \"Function applied to `df` if it's the train set\"\n", " raise NotImplementedError\n", " def apply_test(self, df:DataFrame):\n", " \"Function applied to `df` if it's the test set\"\n", " self.apply_train(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class Categorify(TabularTransform):\n", " \"Transforms the categorical variables to that type.\"\n", " \n", " def 
apply_train(self, df:DataFrame):\n", "        self.categories = {}\n", "        for n in self.cat_names: \n", "            df[n] = df[n].astype('category').cat.as_ordered()\n", "            self.categories[n] = df[n].cat.categories\n", "    \n", "    def apply_test(self, df:DataFrame):\n", "        #Reuse the categories seen at train time so the codes stay consistent\n", "        for n in self.cat_names:\n", "            df[n] = pd.Categorical(df[n], categories=self.categories[n], ordered=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "categorify = Categorify(small_cat_vars, small_cont_vars)\n", "categorify(small_train_df)\n", "categorify(small_test_df, test=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df['PromoInterval'].cat.codes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df['Store'].cat.codes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON CONSTANT')\n", "\n", "@dataclass\n", "class FillMissing(TabularTransform):\n", "    \"Fill the missing values in continuous columns\"\n", "    fill_strategy:FillStrategy=FillStrategy.MEDIAN\n", "    add_col:bool=True\n", "    fill_val:float=0.\n", "    \n", "    def apply_train(self, df:DataFrame):\n", "        #Compute the fillers on the train set and store them for apply_test\n", "        self.na_dict = {}\n", "        for name in self.cont_names:\n", "            if pd.isnull(df[name]).sum():\n", "                if self.add_col: \n", "                    df[name+'_na'] = pd.isnull(df[name])\n", "                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')\n", "                if self.fill_strategy == FillStrategy.MEDIAN: filler = df[name].median() \n", "                elif self.fill_strategy == FillStrategy.CONSTANT: filler = self.fill_val\n", "                else: filler = df[name].dropna().value_counts().idxmax()\n", "                df[name] = df[name].fillna(filler)\n", "                self.na_dict[name] = filler\n", "    \n", "    def apply_test(self, df:DataFrame): \n", "        for name in self.cont_names:\n", "            if name in self.na_dict:\n", "                if self.add_col: \n", "                    df[name+'_na'] = pd.isnull(df[name])\n", "                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')\n", "                df[name] = df[name].fillna(self.na_dict[name])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fill_missing = FillMissing(small_cat_vars, small_cont_vars)\n", "fill_missing(small_train_df)\n", "fill_missing(small_test_df, test=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df[small_train_df['CompetitionDistance_na'] == True]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_test_df[small_test_df['CompetitionDistance_na'] == True]" ] },
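{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick illustration of the three fill strategies on toy values (illustrative numbers only, not pipeline data): MEDIAN fills with the column median, COMMON with the most frequent value, CONSTANT with the given `fill_val`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Toy check of the fillers each strategy would compute for this column:\n", "#median of [0, 1, 5, 5] is 3.0, most common value is 5.0; CONSTANT would use fill_val.\n", "s = pd.Series([0., 1., 5., 5., np.nan])\n", "s.median(), s.dropna().value_counts().idxmax()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from pandas.api.types import is_numeric_dtype, is_categorical_dtype\n", "\n", "OptStrList = Optional[StrList]\n", "OptStats = Optional[Tuple[np.ndarray, np.ndarray]]\n", "OptTabTfms = Optional[Collection[TabularTransform]]\n", "OptDataFrame = Optional[DataFrame]\n", "\n", "class TabularDataset(DatasetBase):\n", "    \"Class for tabular data\"\n", "    def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None, cont_names:OptStrList=None, \n", "                 stats:OptStats=None, log_output:bool=False):\n", "        if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes\n", "        self.y = 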
torch.tensor(df[dep_var].values)\n", "        if log_output: self.y = torch.log(self.y.float())\n", "        n = len(self.y)\n", "        if cat_names and len(cat_names) >= 1:\n", "            self.cats = np.stack([c.cat.codes.values for n,c in df[cat_names].items()], 1) + 1\n", "        else: self.cats = np.zeros((n,1))\n", "        self.cats = LongTensor(self.cats.astype(np.int64))\n", "        if cont_names and len(cont_names) >= 1:\n", "            self.conts = np.stack([c.astype('float32').values for n,c in df[cont_names].items()], 1)\n", "            means, stds = stats if stats is not None else (self.conts.mean(0), self.conts.std(0))\n", "            self.conts = (self.conts - means[None]) / stds[None]\n", "            self.stats = means,stds\n", "        else: \n", "            self.conts = np.zeros((n,1), dtype=np.float32)\n", "            self.stats = None\n", "        self.conts = FloatTensor(self.conts)\n", "    \n", "    def __len__(self) -> int: return len(self.y)\n", "    def __getitem__(self, idx) -> Tuple[Tuple[LongTensor,FloatTensor], Tensor]: \n", "        return ((self.cats[idx], self.conts[idx]), self.y[idx])\n", "    @property\n", "    def c(self) -> int: return 1\n", "    \n", "    @classmethod\n", "    def from_dataframe(cls, df:DataFrame, dep_var:str, tfms:OptTabTfms=None, cat_names:OptStrList=None, \n", "                       cont_names:OptStrList=None, stats:OptStats=None, log_output:bool=False) -> 'TabularDataset':\n", "        \"Creates a `TabularDataset` from `df` after applying optional transforms\"\n", "        if cat_names is None: cat_names = [n for n in df.columns if is_categorical_dtype(df[n])]\n", "        if cont_names is None: cont_names = [n for n in df.columns if is_numeric_dtype(df[n]) and not n==dep_var]\n", "        if tfms is None: tfms = []\n", "        for i,tfm in enumerate(tfms):\n", "            #An already instantiated transform was fitted on train: apply it in test mode.\n", "            if isinstance(tfm, TabularTransform): tfm(df, test=True)\n", "            else:\n", "                tfm = tfm(cat_names, cont_names)\n", "                tfm(df)\n", "                tfms[i] = tfm\n", "                cat_names, cont_names = tfm.cat_names, tfm.cont_names\n", "        ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)\n", "        ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names\n", "        return ds\n", "    \n", "def data_from_tabulardf(path, train_df:DataFrame, valid_df:DataFrame, dep_var:str, test_df:OptDataFrame=None, \n", "                        tfms:OptTabTfms=None, cat_names:OptStrList=None, cont_names:OptStrList=None, \n", "                        stats:OptStats=None, log_output:bool=False, **kwargs) -> DataBunch:\n", "    \"Creates a `DataBunch` from train/valid/test dataframes.\"\n", "    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names, cont_names, stats, log_output)\n", "    valid_ds = TabularDataset.from_dataframe(valid_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                             train_ds.cont_names, train_ds.stats, log_output)\n", "    datasets = [train_ds, valid_ds]\n", "    if test_df is not None:\n", "        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                                      train_ds.cont_names, train_ds.stats, log_output))\n", "    return DataBunch.create(*datasets, path=path, **kwargs)" ] },
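{ "cell_type": "markdown", "metadata": {}, "source": [ "Note the `+ 1` applied to the stacked category codes in `TabularDataset`: pandas encodes a missing or unseen categorical value as code -1, so the shift maps it to 0 and keeps every code a valid non-negative embedding index. A toy check (illustrative values only):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#'c' is not among the categories and None is missing: both get code -1, shifted to 0.\n", "pd.Categorical(['a', 'c', None], categories=['a', 'b']).codes + 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = pd.read_feather(PATH/'train_clean')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx = np.random.permutation(range(n))[:2000]\n", "idx.sort()\n", "small_train_df = train_df.iloc[idx[:1000]]\n", "small_test_df = train_df.iloc[idx[1000:]]\n", "small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']\n", "small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']\n", "small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]\n", "small_test_df = 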
small_test_df[small_cat_vars+small_cont_vars + ['Sales']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dep_var = 'Sales'\n", "tfms = [FillMissing, Categorify] #FillMissing first so that the added _na columns are categorified\n", "train_ds = TabularDataset.from_dataframe(small_train_df, dep_var, tfms, small_cat_vars, \n", "                                         small_cont_vars, log_output=True)\n", "valid_ds = TabularDataset.from_dataframe(small_test_df, dep_var, train_ds.tfms, train_ds.cat_names, \n", "                                         train_ds.cont_names, train_ds.stats, log_output=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_ds[2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_ds.stats, valid_ds.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "small_train_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',\n", "    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',\n", "    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',\n", "    'SchoolHoliday_fw', 'SchoolHoliday_bw']\n", "\n", "cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',\n", "    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', \n", "    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',\n", "    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dep_var = 'Sales'\n", "train_df = pd.read_feather(PATH/'train_clean')\n", "train_df = train_df[cat_vars+cont_vars+[dep_var, 'Date']].copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_df['Date'].min(), test_df['Date'].max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(test_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Pick a cut so the validation set has the same number of rows as the test set\n", "cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()\n", "cut" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = train_df.set_index('Date')\n", "train_df,valid_df = train_df[cut:], train_df[:cut]\n", "len(train_df),len(valid_df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = [FillMissing, Categorify]\n", "data = data_from_tabulardf(PATH, train_df, valid_df, dep_var, tfms=tfms, cat_names=cat_vars, \n", "                           cont_names=cont_vars, log_output=True, num_workers=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] },
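{ "cell_type": "markdown", "metadata": {}, "source": [ "Each categorical variable goes through its own embedding; the embeddings are concatenated with the batch-normalized continuous variables and fed to fully connected layers. For regression with a `y_range`, the network ends with a sigmoid whose output is rescaled to $$y = (y_{max} - y_{min}) \\cdot \\sigma(x) + y_{min}$$ so that predictions always fall inside the target range." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "ListSizes = Collection[Tuple[int,int]]\n", "OptRange = Optional[Tuple[float,float]]\n", "\n", "class TabularModel(nn.Module):\n", "    \"Basic model for tabular data\"\n", "    \n", "    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int], drops:Collection[float], \n", "                 emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, is_reg:bool=False, is_multi:bool=False):\n", "        super().__init__()\n", "        self.embeds 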
= nn.ModuleList([get_embedding(ni, nf) for ni,nf in emb_szs])\n", "        self.emb_drop = nn.Dropout(emb_drop)\n", "        self.bn_cont = nn.BatchNorm1d(n_cont)\n", "        n_emb = sum(e.embedding_dim for e in self.embeds)\n", "        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range\n", "        if is_reg: final_act = None if y_range is None else nn.Sigmoid()\n", "        else: final_act = nn.LogSoftmax() if is_multi else nn.Sigmoid()\n", "        sizes = [n_emb + n_cont] + layers + [out_sz]\n", "        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [final_act]\n", "        layers = []\n", "        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+drops,actns)):\n", "            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)\n", "        self.layers = nn.Sequential(*layers)\n", "    \n", "    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:\n", "        if self.n_emb != 0:\n", "            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]\n", "            x = torch.cat(x, 1)\n", "            x = self.emb_drop(x)\n", "        if self.n_cont != 0:\n", "            x_cont = self.bn_cont(x_cont)\n", "            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont\n", "        x = self.layers(x)\n", "        if self.y_range is not None: x = (self.y_range[1] - self.y_range[0]) * x + self.y_range[0]\n", "        return x.squeeze()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#+1 leaves an embedding row for the 'missing/unknown' code 0\n", "cat_szs = [len(train_df[n].cat.categories)+1 for n in cat_vars]\n", "#Rule of thumb: embedding width is half the category count, capped at 50\n", "emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]\n", "emb_szs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "max_log_y = np.log(np.max(train_df['Sales']))\n", "y_range = torch.tensor([0, max_log_y*1.2], device=default_device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range, is_reg=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def exp_rmspe(pred:Tensor, targ:Tensor) -> Rank0Tensor:\n", "    pred, targ = torch.exp(pred), torch.exp(targ)\n", "    pct_var = (targ - pred)/targ\n", "    return torch.sqrt((pct_var**2).mean())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = Learner(data, model)\n", "learn.loss_fn = F.mse_loss\n", "learn.metrics = [exp_rmspe]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.lr_find()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.recorder.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with torch.no_grad():\n", "    pct_var,cnt = 0.,0\n", "    for x,y in learn.data.valid_dl:\n", "        out = learn.model(*x)\n", "        cnt += y.size(0)\n", "        y, out = torch.exp(y), torch.exp(out)\n", "        pct_var += ((y - out)/y).pow(2).sum()\n", "torch.sqrt(pct_var/cnt).item()" ] },
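{ "cell_type": "markdown", "metadata": {}, "source": [ "The loop above recomputes the validation metric by hand: since the targets are log sales, predictions and targets are exponentiated back before taking the root mean squared percentage error $$\\mathrm{RMSPE} = \\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}\\left(\\frac{y_i - \\hat{y}_i}{y_i}\\right)^2},$$ which should match the `exp_rmspe` values reported during training." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 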
3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }