{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Exercise 01\n",
    "The goal is to find the best set of hyper-parameters which maximize the\n",
    "performance on a training set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "# This line is currently required to import HistGradientBoostingClassifier\n",
    "from sklearn.experimental import enable_hist_gradient_boosting\n",
    "from sklearn.ensemble import HistGradientBoostingClassifier\n",
    "\n",
    "from scipy.stats import expon, uniform\n",
    "from scipy.stats import randint\n",
    "\n",
    "df = pd.read_csv(\"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n",
    "# Or use the local copy:\n",
    "# df = pd.read_csv('../datasets/adult-census.csv')\n",
    "\n",
    "target_name = \"class\"\n",
    "target = df[target_name].to_numpy()\n",
    "data = df.drop(columns=target_name)\n",
    "\n",
    "df_train, df_test, target_train, target_test = train_test_split(\n",
    "    data, target, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You should:\n",
    "- create a preprocessor using an `OrdinalEncoder`\n",
    "- use a `HistGradientBoostingClassifier` to make predictions\n",
    "- use a `RandomizedSearchCV` to find the best set of hyper-parameters by\n",
    "  tuning the following parameters: `learning_rate`, `l2_regularization`,\n",
    "  `max_leaf_nodes`, and `min_samples_leaf`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ordinal_encoding_columns = ['workclass', 'education', 'marital-status',\n",
    "                            'occupation', 'relationship', 'race',\n",
    "                            'native-country', 'sex']\n",
    "\n",
    "categories = [data[column].unique()\n",
    "              for column in data[ordinal_encoding_columns]]\n",
    "\n",
    "preprocessor = ColumnTransformer(\n",
    "    [('ordinal-encoder', OrdinalEncoder(categories=categories),\n",
    "      ordinal_encoding_columns)],\n",
    "    remainder='passthrough', sparse_threshold=0\n",
    ")\n",
    "\n",
    "model = Pipeline(\n",
    "    [('preprocessor', preprocessor),\n",
    "     ('gbrt', HistGradientBoostingClassifier(max_iter=50))]\n",
    ")\n",
    "param_distributions = {\n",
    "    'gbrt__learning_rate': expon(loc=0.001, scale=0.5),\n",
    "    'gbrt__l2_regularization': uniform(loc=0, scale=0.5),\n",
    "    'gbrt__max_leaf_nodes': randint(5, 30),\n",
    "    'gbrt__min_samples_leaf': randint(5, 30)\n",
    "}\n",
    "model_grid_search = RandomizedSearchCV(\n",
    "    model, param_distributions=param_distributions, n_iter=10, n_jobs=4\n",
    ")\n",
    "model_grid_search.fit(df_train, target_train)\n",
    "print(\n",
    "    f\"The accuracy score using a {model_grid_search.__class__.__name__} is \"\n",
    "    f\"{model_grid_search.score(df_test, target_test):.2f}\"\n",
    ")\n",
    "print(f\"The best set of parameters is: {model_grid_search.best_params_}\")"
   ]
  },
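  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can also inspect all the candidates evaluated during the search, stored\n",
    "in `cv_results_`, sorted by their mean cross-validated test score."
   ]
  },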
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_results = pd.DataFrame(model_grid_search.cv_results_)\n",
    "columns = (['mean_test_score', 'std_test_score'] +\n",
    "           [col for col in df_results.columns if 'param_' in col])\n",
    "df_results.sort_values(by='mean_test_score', ascending=False)[\n",
    "    columns\n",
    "]"
   ]
  },
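  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (a minimal sketch), we can compare the best\n",
    "cross-validated score found during the search with the held-out test score\n",
    "reported above, and retrieve the refitted best pipeline for later reuse."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean cross-validated accuracy of the best candidate found by the search.\n",
    "print(f\"Best cross-validated accuracy: {model_random_search.best_score_:.2f}\")\n",
    "\n",
    "# With the default refit=True, the search refits the best pipeline on the\n",
    "# whole training set; `best_model` is just a local alias for it.\n",
    "best_model = model_random_search.best_estimator_\n",
    "best_model"
   ]
  },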
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "python_scripts//py:percent,notebooks//ipynb"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}