{ "cells": [ { "cell_type": "markdown", "metadata": { "_uuid": "cdd204371f9f83d15616dab1a94370e1eba0d9b9", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "1c927176-236a-411f-9d70-50c3feafcd97", "isComponent": false, "name": "", "parents": [] }, "tags": [] }, "source": [ "# XGBoost Example\n", "\n", "Exploring the use of XGBoost and its integration with Scikit-Learn.\n", "\n", "Some useful links:\n", "* [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/index.html)\n", "* [Parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)\n", "* [Python package](https://xgboost.readthedocs.io/en/latest/python/python_intro.html)\n", "* [Python examples](https://github.com/dmlc/xgboost/tree/master/demo/guide-python)\n", "* [scikit-learn examples](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py)\n", "* [Diabetes dataset](http://scikit-learn.org/stable/datasets/index.html#diabetes-dataset)\n", "* [Breast cancer dataset](http://scikit-learn.org/stable/datasets/index.html#breast-cancer-wisconsin-diagnostic-database)\n", "\n", "Objective is to demonstrate:\n", "* regression ✓\n", "* binary classification ✓\n", "* multiclass classification ✓\n", "* cross-validation ✓\n", "* hyperparameter searching ✓\n", "\n", "### Note\n", "This notebook is adapted from https://www.kaggle.com/code/stuarthallows/using-xgboost-with-scikit-learn/notebook" ] }, { "cell_type": "markdown", "metadata": { "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "inherit", "id": "605ad26a-ef2f-4ec8-90d1-66cd110e78b9", "isComponent": false, "name": "", "parents": [] }, "tags": [] }, "source": [ "### Required Python Packages\n", "- `numpy`\n", "- `scikit-learn`\n", "- `scipy`\n", "- `xgboost`\n", "\n", "Run the following cell to install the packages." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "inherit", "id": "7f467622-294c-4628-9e6e-526c2c3c3851", "isComponent": false, "name": "", "parents": [] } }, "outputs": [], "source": [ "#\n", "# Required Packages\n", "# Run this cell to install required packages.\n", "#\n", "%pip install \"numpy>=1.19\" \"pandas>=1.1\" \"scikit-learn>=0.22.2\" \"scipy>=1.7\" \"xgboost>=1.6\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "4ab228c84471891201204c1c70c0e4535ff76db9", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "0b9140b9-e1c3-41bb-9e2a-db6a4e20469f", "isComponent": true, "name": "Import Packages", "parents": [] }, "tags": [] }, "outputs": [], "source": [ "import numpy as np\n", "import xgboost as xgb\n", "from scipy.stats import randint, uniform\n", "from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine\n", "from sklearn.metrics import accuracy_score, auc, confusion_matrix, mean_squared_error\n", "from sklearn.model_selection import (\n", " GridSearchCV,\n", " KFold,\n", " RandomizedSearchCV,\n", " cross_val_score,\n", " train_test_split,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "63f609040e124bf1ef3b7853daab97c6be268312", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "a4589d15-2d90-4495-bd48-078de4953373", "isComponent": true, "name": "Define functions", "parents": [ { "id": "0b9140b9-e1c3-41bb-9e2a-db6a4e20469f", "name": "Import Packages" } ] }, "tags": [] }, "outputs": [], "source": [
  "def display_scores(scores):\n",
  "    \"\"\"Print a collection of scores followed by their mean and standard deviation.\n",
  "\n",
  "    Args:\n",
  "        scores: Array-like of numeric scores, e.g. one RMSE value per fold.\n",
  "    \"\"\"\n",
  "    print(\"Scores: {0}\\nMean: {1:.3f}\\nStd: {2:.3f}\".format(scores, np.mean(scores), np.std(scores)))\n",
  "\n",
  "\n",
  "def report_best_scores(results, n_top=3):\n",
  "    \"\"\"Print the best-ranked candidates of a hyperparameter search.\n",
  "\n",
  "    Args:\n",
  "        results: Mapping with the keys ``rank_test_score``, ``mean_test_score``,\n",
  "            ``std_test_score`` and ``params``, each an equally long sequence\n",
  "            (this notebook passes the ``cv_results_`` of a fitted search).\n",
  "        n_top: Number of ranks to report, starting at rank 1. Every candidate\n",
  "            that shares a reported rank is printed.\n",
  "    \"\"\"\n",
  "    # Coerce to an array so the elementwise comparison below also works when\n",
  "    # the ranks arrive as a plain list instead of a NumPy array.\n",
  "    ranks = np.asarray(results[\"rank_test_score\"])\n",
  "    for i in range(1, n_top + 1):\n",
  "        # Several candidates can tie on a rank, so collect every matching index.\n",
  "        candidates = np.flatnonzero(ranks == i)\n",
  "        for candidate in candidates:\n",
  "            print(\"Model with rank: {0}\".format(i))\n",
  "            print(\n",
  "                \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n",
  "                    results[\"mean_test_score\"][candidate], results[\"std_test_score\"][candidate]\n",
  "                )\n",
  "            )\n",
  "            print(\"Parameters: {0}\".format(results[\"params\"][candidate]))\n",
  "            print(\"\")"
] },
{ "cell_type": "markdown", "metadata": { "_uuid": "37811e5665f546205a98f9d7f77375fce4a1d9b9", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "1c42b075-0c58-4fa1-b2bd-2a13aded9d0c", "isComponent": false, "name": "", "parents": [] } }, "source": ["## Regression"] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "e35bb62d5ea4622413443ff43d578e7b2893c309", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "a740b745-e62e-4830-8f03-5e6b5a1df375", "isComponent": true, "name": "Regression task", "parents": [ { "id": "a4589d15-2d90-4495-bd48-078de4953373", "name": "Define functions" } ] }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[23:05:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "0.2401475171547707\n", "XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,\n", " colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,\n", " early_stopping_rounds=None, enable_categorical=False,\n", " eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',\n", " importance_type=None, interaction_constraints='',\n", " learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,\n", " max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,\n", " 
missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,\n", " num_parallel_tree=1, objective='reg:linear', predictor='auto',\n", " random_state=42, reg_alpha=0, ...)\n" ] } ], "source": [
  "diabetes = load_diabetes()\n",
  "\n",
  "X = diabetes.data\n",
  "y = diabetes.target\n",
  "\n",
  "# \"reg:linear\" is a deprecated alias; \"reg:squarederror\" is the replacement XGBoost asks for.\n",
  "xgb_model = xgb.XGBRegressor(objective=\"reg:squarederror\", random_state=42)\n",
  "\n",
  "xgb_model.fit(X, y)\n",
  "\n",
  "# Predictions are made on the rows used for fitting, so the RMSE below is an in-sample figure.\n",
  "y_pred = xgb_model.predict(X)\n",
  "\n",
  "mse = mean_squared_error(y, y_pred)\n",
  "\n",
  "print(np.sqrt(mse))\n",
  "print(xgb_model)"
] },
{ "cell_type": "markdown", "metadata": { "_uuid": "3e675a3449f6d7f6fabd8b634195fde22c2cd670", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "dd15be85-f70c-430f-a537-c22c732f4cf6", "isComponent": false, "name": "", "parents": [] } }, "source": ["## Binary classification"] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "21eee83d9068275fac549b80394d399b97edd2c5", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "97ffa820-42dc-4017-849c-932af34eec11", "isComponent": true, "name": "Binary Classification", "parents": [ { "id": "a4589d15-2d90-4495-bd48-078de4953373", "name": "Define functions" } ] }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": ["[[212 0]\n", " [ 0 357]]\n"] } ], "source": [
  "cancer = load_breast_cancer()\n",
  "\n",
  "X = cancer.data\n",
  "y = cancer.target\n",
  "\n",
  "xgb_model = xgb.XGBClassifier(objective=\"binary:logistic\", random_state=42)\n",
  "xgb_model.fit(X, y)\n",
  "\n",
  "# Predictions are made on the rows used for fitting, so this confusion matrix is in-sample.\n",
  "y_pred = xgb_model.predict(X)\n",
  "\n",
  "print(confusion_matrix(y, y_pred))"
] },
{ "cell_type": "markdown", "metadata": { "_uuid": "1bb24203c37935875a7628974d8748e4386aae80", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "f296ca50-cd9c-423c-9653-1f3284d54c58", "isComponent": false, "name": "", "parents": [] } }, "source": ["## Multiclass classification"] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "84869c7bcb30f512af78b94ea768aa022264038d", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "6503eaf0-4556-4e52-a606-ad1ab6d7720a", "isComponent": true, "name": "Multiclass Classification", "parents": [ { "id": "a4589d15-2d90-4495-bd48-078de4953373", "name": "Define functions" } ] }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": ["[[59 0 0]\n", " [ 0 71 0]\n", " [ 0 0 48]]\n"] } ], "source": [
  "wine = load_wine()\n",
  "\n",
  "X = wine.data\n",
  "y = wine.target\n",
  "\n",
  "xgb_model = xgb.XGBClassifier(objective=\"multi:softprob\", random_state=42)\n",
  "xgb_model.fit(X, y)\n",
  "\n",
  "# Predictions are made on the rows used for fitting, so this confusion matrix is in-sample.\n",
  "y_pred = xgb_model.predict(X)\n",
  "\n",
  "print(confusion_matrix(y, y_pred))"
] },
{ "cell_type": "markdown", "metadata": { "_uuid": "e79c504b5e909694f049100483d957f8f3b83bb7", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "f174f373-5fbb-4f8f-bbf3-857e03e1c6e8", "isComponent": false, "name": "", "parents": [] } }, "source": [ "## Cross validation\n", "\n", "Cross-validation using `KFold`" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "c55ecf8bcfbad2d6f2cfa90ea6f3e594323128d4", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "7b578e2b-015e-463a-9f45-ce9952cf1415", "isComponent": true, "name": "Cross Validation", "parents": [ { "id": "a4589d15-2d90-4495-bd48-078de4953373", "name": "Define functions" } ] }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[23:05:34] WARNING: 
C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "Scores: [63.94113133 61.42459265 67.48347385 69.49735119 59.90352074]\n", "Mean: 64.450\n", "Std: 3.599\n", "[23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "[23:05:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.\n", "Scores: [62.80101886 65.82933114 62.19849188 66.40701402 67.29879575]\n", "Mean: 
64.907\n", "Std: 2.029\n" ] } ], "source": [
  "diabetes = load_diabetes()\n",
  "\n",
  "X = diabetes.data\n",
  "y = diabetes.target\n",
  "\n",
  "# Manual cross-validation: fit a fresh model on every shuffled KFold split and keep the test MSE.\n",
  "kfold = KFold(n_splits=5, shuffle=True, random_state=42)\n",
  "\n",
  "scores = []\n",
  "\n",
  "for train_index, test_index in kfold.split(X):\n",
  "    X_train, X_test = X[train_index], X[test_index]\n",
  "    y_train, y_test = y[train_index], y[test_index]\n",
  "\n",
  "    # \"reg:linear\" is a deprecated alias; \"reg:squarederror\" is the replacement XGBoost asks for.\n",
  "    xgb_model = xgb.XGBRegressor(objective=\"reg:squarederror\")\n",
  "    xgb_model.fit(X_train, y_train)\n",
  "\n",
  "    y_pred = xgb_model.predict(X_test)\n",
  "\n",
  "    scores.append(mean_squared_error(y_test, y_pred))\n",
  "\n",
  "# Square root of each fold's MSE gives the per-fold RMSE.\n",
  "display_scores(np.sqrt(scores))\n",
  "\n",
  "# Same evaluation through cross_val_score. An integer cv on a regressor means unshuffled KFold,\n",
  "# so the folds (and therefore the scores) differ from the shuffled split above.\n",
  "xgb_model = xgb.XGBRegressor(objective=\"reg:squarederror\", random_state=42)\n",
  "\n",
  "scores = cross_val_score(xgb_model, X, y, scoring=\"neg_mean_squared_error\", cv=5)\n",
  "\n",
  "# The scorer returns negated MSE, so flip the sign before taking the square root.\n",
  "display_scores(np.sqrt(-scores))"
] },
{ "cell_type": "markdown", "metadata": { "_uuid": "352a19638053ea9e7e5993c324169e08d87c7434", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "e3406e02-dd43-4c62-b902-38ebaaecea41", "isComponent": false, "name": "", "parents": [] } }, "source": ["Cross-validation using `cross_val_score`"] },
{ "cell_type": "markdown", "metadata": { "_uuid": "b3830b4750c56d1cdd2e11529998dd36a05b2809", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "f2689880-86bb-447f-9232-0fa09b592695", "isComponent": false, "name": "", "parents": [] } }, "source": ["## Hyperparameter searching"] },
{ "cell_type": "code", "execution_count": null, "metadata": { "_uuid": "170c2d75670a537b2c06a59b4d31db380c811bb0", "canvas": { "comments": [], "componentType": "CodeCell", "copiedOriginId": null, "diskcache": false, "headerColor": "none", "id": "a55c0490-f918-4e16-b937-7294e76a8504", "isComponent": true, "name": "Hyperparameter searching", "parents": [ { "id": "a4589d15-2d90-4495-bd48-078de4953373", "name": "Define functions" } ] }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 200 candidates, totalling 600 fits\n", "Model with rank: 1\n", "Mean validation score: 0.464 (std: 0.009)\n", "Parameters: {'colsample_bytree': 0.7516959613604889, 'gamma': 0.09614450940433539, 'learning_rate': 0.042260584879943656, 'max_depth': 2, 'n_estimators': 117, 'subsample': 0.7114361356127834}\n", "\n" ] } ], "source": [
  "diabetes = load_diabetes()\n",
  "\n",
  "X = diabetes.data\n",
  "y = diabetes.target\n",
  "\n",
  "xgb_model = xgb.XGBRegressor()\n",
  "\n",
  "# scipy's uniform(loc, scale) samples from [loc, loc + scale];\n",
  "# randint(low, high) samples integers from low up to high - 1.\n",
  "# Defaults quoted below are the ones XGBoost 1.6 reports for XGBRegressor.\n",
  "params = {\n",
  "    \"colsample_bytree\": uniform(0.7, 0.3),  # 0.7 to 1.0\n",
  "    \"gamma\": uniform(0, 0.5),  # 0 to 0.5\n",
  "    \"learning_rate\": uniform(0.03, 0.3),  # 0.03 to 0.33; default 0.3\n",
  "    \"max_depth\": randint(2, 6),  # 2 to 5; default 6\n",
  "    \"n_estimators\": randint(100, 150),  # 100 to 149; default 100\n",
  "    \"subsample\": uniform(0.6, 0.4),  # 0.6 to 1.0\n",
  "}\n",
  "\n",
  "# No scoring is passed, so candidates are ranked by the regressor's own score method (R^2).\n",
  "search = RandomizedSearchCV(\n",
  "    xgb_model,\n",
  "    param_distributions=params,\n",
  "    random_state=42,\n",
  "    n_iter=200,\n",
  "    cv=3,\n",
  "    verbose=1,\n",
  "    n_jobs=1,\n",
  "    return_train_score=True,\n",
  ")\n",
  "\n",
  "search.fit(X, y)\n",
  "\n",
  "report_best_scores(search.cv_results_, 1)"
] } ],
"metadata": { "canvas": { "colorPalette": [ "inherit", "inherit", "inherit", "inherit", "inherit", "inherit", "inherit", "inherit", "inherit", "inherit" ], "parameters": [], "version": "1.0" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 4 }