{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tabular example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastai.tabular import * # Quick access to tabular functionality"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tabular data should be in a Pandas `DataFrame`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Resolve the ADULT_SAMPLE dataset directory, then read adult.csv from it.\n",
"path = untar_data(URLs.ADULT_SAMPLE)\n",
"df = pd.read_csv(path / 'adult.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['>=50k', '<50k'], dtype=object)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['salary'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" workclass | \n",
" fnlwgt | \n",
" education | \n",
" education-num | \n",
" marital-status | \n",
" occupation | \n",
" relationship | \n",
" race | \n",
" sex | \n",
" capital-gain | \n",
" capital-loss | \n",
" hours-per-week | \n",
" native-country | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 49 | \n",
" Private | \n",
" 101320 | \n",
" Assoc-acdm | \n",
" 12.0 | \n",
" Married-civ-spouse | \n",
" NaN | \n",
" Wife | \n",
" White | \n",
" Female | \n",
" 0 | \n",
" 1902 | \n",
" 40 | \n",
" United-States | \n",
" >=50k | \n",
"
\n",
" \n",
" 1 | \n",
" 44 | \n",
" Private | \n",
" 236746 | \n",
" Masters | \n",
" 14.0 | \n",
" Divorced | \n",
" Exec-managerial | \n",
" Not-in-family | \n",
" White | \n",
" Male | \n",
" 10520 | \n",
" 0 | \n",
" 45 | \n",
" United-States | \n",
" >=50k | \n",
"
\n",
" \n",
" 2 | \n",
" 38 | \n",
" Private | \n",
" 96185 | \n",
" HS-grad | \n",
" NaN | \n",
" Divorced | \n",
" NaN | \n",
" Unmarried | \n",
" Black | \n",
" Female | \n",
" 0 | \n",
" 0 | \n",
" 32 | \n",
" United-States | \n",
" <50k | \n",
"
\n",
" \n",
" 3 | \n",
" 38 | \n",
" Self-emp-inc | \n",
" 112847 | \n",
" Prof-school | \n",
" 15.0 | \n",
" Married-civ-spouse | \n",
" Prof-specialty | \n",
" Husband | \n",
" Asian-Pac-Islander | \n",
" Male | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" >=50k | \n",
"
\n",
" \n",
" 4 | \n",
" 42 | \n",
" Self-emp-not-inc | \n",
" 82297 | \n",
" 7th-8th | \n",
" NaN | \n",
" Married-civ-spouse | \n",
" Other-service | \n",
" Wife | \n",
" Black | \n",
" Female | \n",
" 0 | \n",
" 0 | \n",
" 50 | \n",
" United-States | \n",
" <50k | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age workclass fnlwgt education education-num \\\n",
"0 49 Private 101320 Assoc-acdm 12.0 \n",
"1 44 Private 236746 Masters 14.0 \n",
"2 38 Private 96185 HS-grad NaN \n",
"3 38 Self-emp-inc 112847 Prof-school 15.0 \n",
"4 42 Self-emp-not-inc 82297 7th-8th NaN \n",
"\n",
" marital-status occupation relationship race \\\n",
"0 Married-civ-spouse NaN Wife White \n",
"1 Divorced Exec-managerial Not-in-family White \n",
"2 Divorced NaN Unmarried Black \n",
"3 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander \n",
"4 Married-civ-spouse Other-service Wife Black \n",
"\n",
" sex capital-gain capital-loss hours-per-week native-country salary \n",
"0 Female 0 1902 40 United-States >=50k \n",
"1 Male 10520 0 45 United-States >=50k \n",
"2 Female 0 0 32 United-States <50k \n",
"3 Male 0 0 40 United-States >=50k \n",
"4 Female 0 0 50 United-States <50k "
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Column the model is trained to predict.\n",
"dep_var = 'salary'\n",
"\n",
"# Columns passed to TabularList as categorical variables.\n",
"cat_names = [\n",
"    'workclass', 'education', 'marital-status',\n",
"    'occupation', 'relationship', 'race',\n",
"]\n",
"\n",
"# Columns passed to TabularList as continuous variables.\n",
"cont_names = ['age', 'fnlwgt', 'education-num']\n",
"\n",
"# Preprocessing steps handed to TabularList.from_df through procs=.\n",
"procs = [FillMissing, Categorify, Normalize]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Row positions held out from training. Defined once so that the test slice\n",
"# here and the validation split in the next cell cannot drift apart.\n",
"valid_idx = list(range(800, 1000))\n",
"\n",
"# Demo test set: reuses the held-out rows. No procs are passed here.\n",
"test = TabularList.from_df(df.iloc[valid_idx].copy(), path=path, cat_names=cat_names, cont_names=cont_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data block pipeline: build the item list, split off validation rows,\n",
"# label from the target column, attach the test set, then build the databunch.\n",
"data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)\n",
"        .split_by_idx(valid_idx)\n",
"        .label_from_df(cols=dep_var)\n",
"        .add_test(test)\n",
"        .databunch())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" workclass | \n",
" education | \n",
" marital-status | \n",
" occupation | \n",
" relationship | \n",
" race | \n",
" education-num_na | \n",
" age | \n",
" fnlwgt | \n",
" education-num | \n",
" target | \n",
"
\n",
" \n",
" Private | \n",
" Some-college | \n",
" Never-married | \n",
" Handlers-cleaners | \n",
" Own-child | \n",
" White | \n",
" False | \n",
" -1.5090 | \n",
" 0.2360 | \n",
" -0.0312 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" Some-college | \n",
" Divorced | \n",
" Exec-managerial | \n",
" Not-in-family | \n",
" White | \n",
" False | \n",
" -0.4828 | \n",
" 0.0802 | \n",
" -0.0312 | \n",
" <50k | \n",
"
\n",
" \n",
" Local-gov | \n",
" HS-grad | \n",
" Separated | \n",
" Other-service | \n",
" Not-in-family | \n",
" Black | \n",
" False | \n",
" 0.2502 | \n",
" 0.3442 | \n",
" -0.4224 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" Assoc-voc | \n",
" Married-civ-spouse | \n",
" Craft-repair | \n",
" Husband | \n",
" White | \n",
" False | \n",
" 0.9098 | \n",
" -0.8595 | \n",
" 0.3599 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" HS-grad | \n",
" Never-married | \n",
" Sales | \n",
" Not-in-family | \n",
" White | \n",
" False | \n",
" -1.0692 | \n",
" -0.6139 | \n",
" -0.4224 | \n",
" <50k | \n",
"
\n",
" \n",
" Self-emp-not-inc | \n",
" Bachelors | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" White | \n",
" False | \n",
" -0.3362 | \n",
" 0.2229 | \n",
" 1.1422 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" Bachelors | \n",
" Never-married | \n",
" Farming-fishing | \n",
" Own-child | \n",
" White | \n",
" False | \n",
" -0.9959 | \n",
" -0.8271 | \n",
" 1.1422 | \n",
" >=50k | \n",
"
\n",
" \n",
" State-gov | \n",
" Bachelors | \n",
" Divorced | \n",
" Adm-clerical | \n",
" Unmarried | \n",
" White | \n",
" False | \n",
" 0.6899 | \n",
" -1.3176 | \n",
" 1.1422 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" HS-grad | \n",
" Married-civ-spouse | \n",
" Adm-clerical | \n",
" Wife | \n",
" Black | \n",
" False | \n",
" -1.0692 | \n",
" -1.3076 | \n",
" -0.4224 | \n",
" <50k | \n",
"
\n",
" \n",
" Private | \n",
" Masters | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" Asian-Pac-Islander | \n",
" False | \n",
" -0.3362 | \n",
" -0.8518 | \n",
" 1.5334 | \n",
" >=50k | \n",
"
\n",
"
\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data.show_batch(rows=10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Total time: 00:04 \n",
" \n",
" epoch | \n",
" train_loss | \n",
" valid_loss | \n",
" accuracy | \n",
"
\n",
" \n",
" 1 | \n",
" 0.358096 | \n",
" 0.370009 | \n",
" 0.830000 | \n",
"
\n",
"
\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"learn = tabular_learner(data, layers=[200,100], metrics=accuracy)\n",
"learn.fit(1, 1e-2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use the first row of the raw DataFrame as a single example to predict on.\n",
"row = df.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Category <50k, tensor(0), tensor([0.5142, 0.4858]))"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"learn.predict(row)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}