{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tabular example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.tabular import *  # Quick accesss to tabular functionality"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tabular data should be in a Pandas `DataFrame`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = untar_data(URLs.ADULT_SAMPLE)\n",
    "df = pd.read_csv(path/'adult.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['>=50k', '<50k'], dtype=object)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['salary'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function import\n",
    "from fastai.utils.mem import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 7812)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# other function teset\n",
    "gpu_with_max_free_mem()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test reduce_mem_usage(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>workclass</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education</th>\n",
       "      <th>education-num</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>capital-gain</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>hours-per-week</th>\n",
       "      <th>native-country</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>49</td>\n",
       "      <td>Private</td>\n",
       "      <td>101320</td>\n",
       "      <td>Assoc-acdm</td>\n",
       "      <td>12.0</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>1902</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;=50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44</td>\n",
       "      <td>Private</td>\n",
       "      <td>236746</td>\n",
       "      <td>Masters</td>\n",
       "      <td>14.0</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>10520</td>\n",
       "      <td>0</td>\n",
       "      <td>45</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;=50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>96185</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Unmarried</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>32</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>38</td>\n",
       "      <td>Self-emp-inc</td>\n",
       "      <td>112847</td>\n",
       "      <td>Prof-school</td>\n",
       "      <td>15.0</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Asian-Pac-Islander</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;=50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>82297</td>\n",
       "      <td>7th-8th</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age          workclass  fnlwgt     education  education-num  \\\n",
       "0   49            Private  101320    Assoc-acdm           12.0   \n",
       "1   44            Private  236746       Masters           14.0   \n",
       "2   38            Private   96185       HS-grad            NaN   \n",
       "3   38       Self-emp-inc  112847   Prof-school           15.0   \n",
       "4   42   Self-emp-not-inc   82297       7th-8th            NaN   \n",
       "\n",
       "        marital-status        occupation    relationship                 race  \\\n",
       "0   Married-civ-spouse               NaN            Wife                White   \n",
       "1             Divorced   Exec-managerial   Not-in-family                White   \n",
       "2             Divorced               NaN       Unmarried                Black   \n",
       "3   Married-civ-spouse    Prof-specialty         Husband   Asian-Pac-Islander   \n",
       "4   Married-civ-spouse     Other-service            Wife                Black   \n",
       "\n",
       "       sex  capital-gain  capital-loss  hours-per-week  native-country salary  \n",
       "0   Female             0          1902              40   United-States  >=50k  \n",
       "1     Male         10520             0              45   United-States  >=50k  \n",
       "2   Female             0             0              32   United-States   <50k  \n",
       "3     Male             0             0              40   United-States  >=50k  \n",
       "4   Female             0             0              50   United-States   <50k  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dep_var = 'salary'\n",
    "cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']\n",
    "cont_names = ['age', 'fnlwgt', 'education-num']\n",
    "procs = [FillMissing, Categorify, Normalize]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = TabularList.from_df(df.iloc[800:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)\n",
    "                           .split_by_idx(list(range(800,1000)))\n",
    "                           .label_from_df(cols=dep_var)\n",
    "                           .add_test(test)\n",
    "                           .databunch())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>education-num_na</th>\n",
       "      <th>age</th>\n",
       "      <th>fnlwgt</th>\n",
       "      <th>education-num</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-1.1425</td>\n",
       "      <td>-0.9280</td>\n",
       "      <td>1.1422</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-0.5561</td>\n",
       "      <td>0.7244</td>\n",
       "      <td>-0.4224</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-1.5090</td>\n",
       "      <td>-0.1673</td>\n",
       "      <td>-0.0312</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Other-relative</td>\n",
       "      <td>Amer-Indian-Eskimo</td>\n",
       "      <td>False</td>\n",
       "      <td>1.2763</td>\n",
       "      <td>-0.8370</td>\n",
       "      <td>-0.4224</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Local-gov</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Transport-moving</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>0.2502</td>\n",
       "      <td>-1.3617</td>\n",
       "      <td>1.1422</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>Some-college</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-0.8493</td>\n",
       "      <td>-0.3286</td>\n",
       "      <td>-0.0312</td>\n",
       "      <td>&gt;=50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>11th</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Own-child</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-1.5090</td>\n",
       "      <td>0.8521</td>\n",
       "      <td>-1.2046</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>7th-8th</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Tech-support</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-0.2629</td>\n",
       "      <td>0.0550</td>\n",
       "      <td>-2.3781</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Transport-moving</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-0.8493</td>\n",
       "      <td>-0.3286</td>\n",
       "      <td>-0.4224</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Private</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>False</td>\n",
       "      <td>-0.7760</td>\n",
       "      <td>1.3159</td>\n",
       "      <td>1.1422</td>\n",
       "      <td>&lt;50k</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "data.show_batch(rows=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Total time: 00:02 <p><table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.357122</td>\n",
       "      <td>0.381649</td>\n",
       "      <td>0.790000</td>\n",
       "      <td>00:02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "learn = tabular_learner(data, layers=[200,100], metrics=accuracy)\n",
    "learn.fit(1, 1e-2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "row = df.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Category >=50k, tensor(1), tensor([0.3581, 0.6419]))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "learn.predict(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}