{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 01\n", "The goal is to find the best set of hyper-parameters which maximize the\n", "performance on a training set." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
 "from sklearn.preprocessing import OrdinalEncoder\n",
 "from sklearn.compose import ColumnTransformer\n",
 "from sklearn.pipeline import Pipeline\n",
 "# This line is currently required to import HistGradientBoostingClassifier\n",
 "from sklearn.experimental import enable_hist_gradient_boosting\n",
 "from sklearn.ensemble import HistGradientBoostingClassifier\n",
 "\n",
 "from scipy.stats import expon, uniform, randint\n",
 "\n",
 "df = pd.read_csv(\"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n",
 "# Or use the local copy:\n",
 "# df = pd.read_csv('../datasets/adult-census.csv')\n",
 "\n",
 "# Separate the prediction target from the features.\n",
 "target_name = \"class\"\n",
 "target = df[target_name].to_numpy()\n",
 "data = df.drop(columns=target_name)\n",
 "\n",
 "# Hold out a test set; the fixed random_state keeps the split reproducible.\n",
 "df_train, df_test, target_train, target_test = train_test_split(\n",
 "    data, target, random_state=42\n",
 ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "You should:\n",
 "- create a preprocessor using an `OrdinalEncoder`\n",
 "- use a `HistGradientBoostingClassifier` to make predictions\n",
 "- use a `RandomizedSearchCV` to find the best set of hyper-parameters by\n",
 "  tuning the following parameters: `learning_rate`, `l2_regularization`,\n",
 "  `max_leaf_nodes`, and `min_samples_leaf`."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "ordinal_encoding_columns = ['workclass', 'education', 'marital-status',\n",
 "                            'occupation', 'relationship', 'race',\n",
 "                            'native-country', 'sex']\n",
 "\n",
 "# Collect the categories from the full dataset so that the encoder also\n",
 "# knows levels that may only appear in the test split.\n",
 "categories = [data[column].unique()\n",
 "              for column in ordinal_encoding_columns]\n",
 "\n",
 "preprocessor = ColumnTransformer(\n",
 "    [('ordinal-encoder', OrdinalEncoder(categories=categories),\n",
 "      ordinal_encoding_columns)],\n",
 "    remainder='passthrough', sparse_threshold=0\n",
 ")\n",
 "\n",
 "model = Pipeline(\n",
 "    [('preprocessor', preprocessor),\n",
 "     ('gbrt', HistGradientBoostingClassifier(max_iter=50))]\n",
 ")\n",
 "\n",
 "# Distributions to sample each hyper-parameter from (note the 'gbrt__'\n",
 "# prefix targeting the pipeline step).\n",
 "param_distributions = {\n",
 "    'gbrt__learning_rate': expon(loc=0.001, scale=0.5),\n",
 "    'gbrt__l2_regularization': uniform(loc=0, scale=0.5),\n",
 "    'gbrt__max_leaf_nodes': randint(5, 30),\n",
 "    'gbrt__min_samples_leaf': randint(5, 30)\n",
 "}\n",
 "# random_state makes the sampled candidates reproducible across runs.\n",
 "model_random_search = RandomizedSearchCV(\n",
 "    model, param_distributions=param_distributions, n_iter=10, n_jobs=4,\n",
 "    random_state=42\n",
 ")\n",
 "model_random_search.fit(df_train, target_train)\n",
 "print(\n",
 "    f\"The accuracy score using a {model_random_search.__class__.__name__} is \"\n",
 "    f\"{model_random_search.score(df_test, target_test):.2f}\"\n",
 ")\n",
 "print(f\"The best set of parameters is: {model_random_search.best_params_}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Inspect every sampled candidate, best mean cross-validated score first.\n",
 "df_results = pd.DataFrame(model_random_search.cv_results_)\n",
 "columns = (['mean_test_score', 'std_test_score'] +\n",
 "           [col for col in df_results.columns if 'param_' in col])\n",
 "df_results.sort_values(by='mean_test_score', ascending=False)[\n",
 "    columns\n",
 "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "formats": "python_scripts//py:percent,notebooks//ipynb" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } },
"nbformat": 4, "nbformat_minor": 2 }