{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradient boosting\n",
    "\n",
    "This is a minimal but workable implementation of Gradient Boosting Decision Trees with scikit-learn.\n",
    "\n",
    "Read more in this [blog post](http://nicolas-hug.com/blog/gradient_boosting_descent)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.special import expit as sigmoid\n",
    "from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.datasets import make_regression\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BaseGradientBoosting(BaseEstimator):\n",
    "    \"\"\"Gradient boosting on shallow regression trees, seen as gradient descent.\n",
    "\n",
    "    Each iteration fits a depth-3 ``DecisionTreeRegressor`` to the negative\n",
    "    gradient of the loss (scaled by ``learning_rate``) and adds the tree's\n",
    "    output to the running raw predictions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_iter : int\n",
    "        Number of boosting iterations, i.e. number of trees.\n",
    "    learning_rate : float\n",
    "        Factor applied to the negative gradient before each tree is fit.\n",
    "    loss : object\n",
    "        Must expose ``compute_gradients(y_true, y_pred)``, returning the\n",
    "        per-sample gradient of the loss with respect to ``y_pred``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, n_iter, learning_rate, loss):\n",
    "        self.loss = loss\n",
    "        self.learning_rate = learning_rate\n",
    "        self.n_iter = n_iter\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"Fit ``n_iter`` trees on ``(X, y)`` and return ``self``.\"\"\"\n",
    "        # Start from all-zero raw predictions: predict() adds the trees on top\n",
    "        # of an implicit zero baseline, so training has to start from the same\n",
    "        # point (a random start could never be reproduced at predict time).\n",
    "        y_pred_train = np.zeros(X.shape[0])\n",
    "\n",
    "        self.predictors = []\n",
    "\n",
    "        for _ in range(self.n_iter):  # gradient descent in prediction space\n",
    "            negative_gradient = -self.loss.compute_gradients(y, y_pred_train)\n",
    "            new_predictor = DecisionTreeRegressor(max_depth=3)\n",
    "            new_predictor.fit(X, y=self.learning_rate * negative_gradient)\n",
    "            y_pred_train += new_predictor.predict(X)\n",
    "\n",
    "            self.predictors.append(new_predictor)  # save for predict()\n",
    "\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"Return the raw predictions for ``X``: the sum of all tree outputs.\"\"\"\n",
    "        # Seed the sum with an array so that an empty ensemble (n_iter=0)\n",
    "        # still yields one value per sample instead of the scalar 0.\n",
    "        raw_predictions = np.zeros(np.shape(X)[0])\n",
    "        for predictor in self.predictors:\n",
    "            raw_predictions += predictor.predict(X)\n",
    "        return raw_predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GBDT for regression\n",
    "\n",
    "class LeastSquaresLoss:\n",
    "    \"\"\"Squared error loss ``(y_true - y_pred) ** 2``.\"\"\"\n",
    "\n",
    "    def compute_gradients(self, y_true, y_pred):\n",
    "        \"\"\"Return the gradient of the loss with respect to ``y_pred``.\"\"\"\n",
    "        return -2 * (y_true - y_pred)\n",
    "\n",
    "\n",
    "# Mixins go to the left of the BaseEstimator subclass (scikit-learn convention).\n",
    "class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):\n",
    "    \"\"\"Gradient boosting regressor using the least squares loss.\"\"\"\n",
    "\n",
    "    def __init__(self, n_iter=100, learning_rate=.1):\n",
    "        super().__init__(n_iter, learning_rate, loss=LeastSquaresLoss())\n",
    "\n",
    "\n",
    "X, y = make_regression(n_samples=1000, random_state=0)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
    "gb = GradientBoostingRegressor()\n",
    "gb.fit(X_train, y_train)\n",
    "\n",
    "print('r squared on training data: {:.3f}'.format(gb.score(X_train, y_train)))\n",
    "print('r squared on test data: {:.3f}'.format(gb.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GBDT for classification\n",
    "\n",
    "class BinaryCrossEntropy:\n",
    "    \"\"\"Binary cross-entropy loss on raw (pre-sigmoid) predictions.\"\"\"\n",
    "\n",
    "    def compute_gradients(self, y_true, y_pred):\n",
    "        \"\"\"Return the gradient of the loss with respect to the raw ``y_pred``.\"\"\"\n",
    "        return sigmoid(y_pred) - y_true\n",
    "\n",
    "    def raw_predictions_to_proba(self, raw_predictions):\n",
    "        \"\"\"Map raw predictions to probabilities with the sigmoid function.\"\"\"\n",
    "        return sigmoid(raw_predictions)\n",
    "\n",
    "\n",
    "# Mixins go to the left of the BaseEstimator subclass (scikit-learn convention).\n",
    "class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):\n",
    "    \"\"\"Binary gradient boosting classifier using the cross-entropy loss.\"\"\"\n",
    "\n",
    "    def __init__(self, n_iter=100, learning_rate=.1):\n",
    "        super().__init__(n_iter, learning_rate, loss=BinaryCrossEntropy())\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"Return a boolean array: True where the predicted probability exceeds 0.5.\"\"\"\n",
    "        raw_predictions = super().predict(X)\n",
    "        proba_positive_class = self.loss.raw_predictions_to_proba(raw_predictions)\n",
    "        return proba_positive_class > .5\n",
    "\n",
    "\n",
    "X, y = make_classification(n_samples=1000, random_state=0)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
    "gb = GradientBoostingClassifier()\n",
    "gb.fit(X_train, y_train)\n",
    "\n",
    "print('accuracy on training data: {:.3f}'.format(gb.score(X_train, y_train)))\n",
    "print('accuracy on test data: {:.3f}'.format(gb.score(X_test, y_test)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}