class AdaBoostRegressor():
    """Minimal AdaBoost.R2 regressor (linear loss) built on depth-3 regression trees.

    The implementation follows the same steps as scikit-learn's
    ``AdaBoostRegressor`` with its default settings (linear loss, learning
    rate 1), so the two can be compared number-for-number.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of boosting rounds. Boosting may stop earlier, see ``fit``.
    random_state : int, array-like or None, default=0
        Seed handed to ``np.random.RandomState``. It drives both the per-tree
        seeds and the weighted bootstrap draws.

    Attributes (set by ``fit``)
    ---------------------------
    estimators_ : list
        Fitted trees that take part in prediction.
    estimator_weights_ : ndarray of shape (n_estimators,)
        Weight ``log(1 / beta)`` of each round; rounds that never ran stay 0.
    estimator_errors_ : ndarray of shape (n_estimators,)
        Weighted average loss of each round; rounds that never ran stay 1.
    """

    def __init__(self, n_estimators=50, random_state=0):
        self.n_estimators = n_estimators
        # Keep the caller's seed. Previously this was hard-coded to 0, which
        # silently ignored the ``random_state`` argument.
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the boosted ensemble.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)

        Returns
        -------
        self

        Notes
        -----
        Boosting stops early when a tree fits the data perfectly (weighted
        error 0) or when a tree is no better than the 0.5 error bound, in
        which case that tree is discarded unless it is the only one.
        """
        n_samples = X.shape[0]
        # Every sample starts with the same probability of being drawn.
        sample_weight = np.full(n_samples, 1 / n_samples)
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators)
        self.estimator_errors_ = np.ones(self.n_estimators)
        MAX_INT = np.iinfo(np.int32).max
        rng = np.random.RandomState(self.random_state)
        for i in range(self.n_estimators):
            # Draw order matters for reproducibility: first the tree seed,
            # then the uniform numbers for the bootstrap.
            est = DecisionTreeRegressor(max_depth=3,
                                        random_state=rng.randint(MAX_INT))
            # Weighted bootstrap through inverse-CDF sampling.
            cdf = np.cumsum(sample_weight)
            cdf /= cdf[-1]
            uniform_samples = rng.random_sample(n_samples)
            bootstrap_idx = cdf.searchsorted(uniform_samples, side='right')
            est.fit(X[bootstrap_idx], y[bootstrap_idx])

            # Linear loss, scaled into [0, 1] by the largest absolute error.
            y_predict = est.predict(X)
            error_vect = np.abs(y_predict - y)
            error_max = error_vect.max()
            if error_max != 0:
                # Guard: a perfect fit would otherwise divide by zero and
                # poison every weight with NaN.
                error_vect /= error_max
            estimator_error = (sample_weight * error_vect).sum()

            if estimator_error <= 0:
                # Perfect fit: keep the tree with unit weight and stop,
                # instead of computing log(1 / 0).
                self.estimators_.append(est)
                self.estimator_errors_[i] = 0.0
                self.estimator_weights_[i] = 1.0
                break
            if estimator_error >= 0.5:
                # Too weak to help (its weight would be <= 0, which breaks the
                # weighted median). Drop it unless the ensemble would be empty.
                if not self.estimators_:
                    self.estimators_.append(est)
                break

            beta = estimator_error / (1 - estimator_error)
            estimator_weight = np.log(1 / beta)
            self.estimators_.append(est)
            self.estimator_errors_[i] = estimator_error
            self.estimator_weights_[i] = estimator_weight

            # Well-predicted samples (small loss) are shrunk the most, so the
            # next round concentrates on the hard ones.
            sample_weight *= np.power(beta, 1 - error_vect)
            weight_sum = np.sum(sample_weight)
            if weight_sum <= 0:
                break
            sample_weight /= weight_sum
        return self

    def predict(self, X):
        """Predict with the weighted median of the individual tree predictions.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            For every sample, the prediction of the tree at which the
            cumulative estimator weight (taken in order of increasing
            prediction) first reaches half of the total weight.
        """
        n_samples = X.shape[0]
        # Rows are samples, columns are estimators.
        predictions = np.array([est.predict(X) for est in self.estimators_]).T
        sorted_idx = np.argsort(predictions, axis=1)
        weight_cdf = np.cumsum(self.estimator_weights_[sorted_idx], axis=1)
        median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
        # argmax on a boolean row returns the first True position.
        median_idx = median_or_above.argmax(axis=1)
        rows = np.arange(n_samples)
        median_estimators = sorted_idx[rows, median_idx]
        return predictions[rows, median_estimators]