{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Exercise 01\n",
    "The goal is to write an exhaustive search to find the best parameters\n",
    "combination maximizing the model performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "# This line is currently required to import HistGradientBoostingClassifier\n",
    "from sklearn.experimental import enable_hist_gradient_boosting\n",
    "from sklearn.ensemble import HistGradientBoostingClassifier\n",
    "\n",
    "from scipy.stats import expon, uniform\n",
    "from scipy.stats import randint\n",
    "\n",
    "df = pd.read_csv(\n",
    "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n",
    "# Or use the local copy:\n",
    "# df = pd.read_csv('../datasets/adult-census.csv')\n",
    "\n",
    "target_name = \"class\"\n",
    "target = df[target_name].to_numpy()\n",
    "data = df.drop(columns=target_name)\n",
    "\n",
    "df_train, df_test, target_train, target_test = train_test_split(\n",
    "    data, target, random_state=42)\n",
    "\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "\n",
    "categorical_columns = [\n",
    "    'workclass', 'education', 'marital-status', 'occupation',\n",
    "    'relationship', 'race', 'native-country', 'sex']\n",
    "\n",
    "categories = [data[column].unique()\n",
    "              for column in data[categorical_columns]]\n",
    "\n",
    "categorical_preprocessor = OrdinalEncoder(categories=categories)\n",
    "\n",
    "preprocessor = ColumnTransformer(\n",
    "    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],\n",
    "    remainder='passthrough', sparse_threshold=0)\n",
    "\n",
    "from sklearn.experimental import enable_hist_gradient_boosting\n",
    "from sklearn.ensemble import HistGradientBoostingClassifier\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "model = make_pipeline(\n",
    "    preprocessor, HistGradientBoostingClassifier(random_state=42))"
   ]
  },
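  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When setting hyperparameters through the pipeline, each parameter name has\n",
    "to be prefixed by the name of the pipeline step it belongs to. As an\n",
    "optional aside (not part of the original exercise), the snippet below lists\n",
    "the parameter names exposed by `model`, which shows the exact spelling\n",
    "expected by `set_params`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The classifier's parameters are prefixed by 'histgradientboostingclassifier__'\n",
    "# because `make_pipeline` names each step after its lowercased class name.\n",
    "[name for name in model.get_params()\n",
    " if name.startswith('histgradientboostingclassifier__')]"
   ]
  },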
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO: write your solution here\n",
    "\n",
    "Use the previously defined model (called `model`) and using two nested `for`\n",
    "loops, make a search of the best combinations of the `learning_rate` and\n",
    "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n",
    "the model by setting the parameters. The evaluation of the model should be\n",
    "performed using `cross_val_score`. We can propose to define the following\n",
    "parameters search:\n",
    "- `learning_rate` for the values 0.01, 0.1, and 1;\n",
    "- `max_leaf_nodes` for the values 5, 25, 45."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "learning_rate = [0.01, 0.1, 1, 10]\n",
    "max_leaf_nodes = [5, 25, 45]\n",
    "\n",
    "best_score = 0\n",
    "best_params = {}\n",
    "for lr in learning_rate:\n",
    "    for mln in max_leaf_nodes:\n",
    "        model.set_params(\n",
    "            histgradientboostingclassifier__learning_rate=lr,\n",
    "            histgradientboostingclassifier__max_leaf_nodes=mln\n",
    "        )\n",
    "        scores = cross_val_score(model, df_train, target_train, cv=3)\n",
    "        if scores.mean() > best_score:\n",
    "            best_score = scores.mean()\n",
    "            best_params = {'learning-rate': lr, 'max leaf nodes': mln}\n",
    "print(f\"The best accuracy obtained is {best_score:.3f}\")\n",
    "print(f\"The best parameters found are:\\n {best_params}\")"
   ]
  }
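  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an optional follow-up (not part of the original exercise), one possible\n",
    "way to use the held-out test set created earlier is to refit the model on\n",
    "the full training set with the best parameters found, and report its\n",
    "accuracy on `df_test`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Refit the pipeline with the best parameter combination found above and\n",
    "# evaluate it a single time on the held-out test set.\n",
    "model.set_params(\n",
    "    histgradientboostingclassifier__learning_rate=best_params['learning_rate'],\n",
    "    histgradientboostingclassifier__max_leaf_nodes=best_params['max_leaf_nodes']\n",
    ")\n",
    "model.fit(df_train, target_train)\n",
    "test_score = model.score(df_test, target_test)\n",
    "print(f\"Test accuracy with the best parameters: {test_score:.3f}\")"
   ]
  }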
 ],
 "metadata": {
  "jupytext": {
   "formats": "python_scripts//py:percent,notebooks//ipynb"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}