{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AdaBoost from scratch\n",
    "\n",
    "Two minimal AdaBoost classifiers built on depth-1 decision trees (SAMME and SAMME.R), each checked against scikit-learn's `AdaBoostClassifier` on the breast-cancer and iris datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.special import xlogy\n",
    "from sklearn.datasets import load_breast_cancer, load_iris\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier as skAdaBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def assert_matches_sklearn(clf, sk_clf, X):\n",
    "    \"\"\"Check that `clf` reproduces the fitted scikit-learn model `sk_clf` on `X`.\n",
    "\n",
    "    Compares per-round errors and weights, predicted labels and decision scores.\n",
    "    \"\"\"\n",
    "    assert np.allclose(clf.estimator_errors_, sk_clf.estimator_errors_)\n",
    "    assert np.allclose(clf.estimator_weights_, sk_clf.estimator_weights_)\n",
    "    assert np.array_equal(clf.predict(X), sk_clf.predict(X))\n",
    "    assert np.allclose(clf.decision_function(X), sk_clf.decision_function(X))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Implementation 1\n",
    "- SAMME\n",
    "- Each round fits a stump on the current sample weights, then up-weights the samples it misclassified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class AdaBoostClassifier():\n",
    "    \"\"\"Multi-class AdaBoost (SAMME) over depth-1 decision trees.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_estimators : int, default=50\n",
    "        Maximum number of boosting rounds. Boosting stops earlier when a\n",
    "        stump is perfect or no better than random guessing.\n",
    "    random_state : int, default=0\n",
    "        Seed of the generator that draws each stump's ``random_state``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, n_estimators=50, random_state=0):\n",
    "        self.n_estimators = n_estimators\n",
    "        self.random_state = random_state\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"Boost decision stumps on ``X`` with labels ``y`` and return ``self``.\n",
    "\n",
    "        Sets ``classes_``, ``n_classes_``, ``estimators_``,\n",
    "        ``estimator_weights_`` and ``estimator_errors_``. Rounds that never\n",
    "        ran keep weight 0 and error 1.\n",
    "\n",
    "        Raises\n",
    "        ------\n",
    "        ValueError\n",
    "            If the very first stump is no better than random guessing.\n",
    "        \"\"\"\n",
    "        self.classes_, y_train = np.unique(y, return_inverse=True)\n",
    "        self.n_classes_ = len(self.classes_)\n",
    "        sample_weight = np.full(X.shape[0], 1 / X.shape[0])\n",
    "        self.estimators_ = []\n",
    "        self.estimator_weights_ = np.zeros(self.n_estimators)\n",
    "        self.estimator_errors_ = np.ones(self.n_estimators)\n",
    "        MAX_INT = np.iinfo(np.int32).max\n",
    "        rng = np.random.RandomState(self.random_state)\n",
    "        for i in range(self.n_estimators):\n",
    "            est = DecisionTreeClassifier(max_depth=1,\n",
    "                                         random_state=rng.randint(MAX_INT))\n",
    "            est.fit(X, y_train, sample_weight=sample_weight)\n",
    "            incorrect = est.predict(X) != y_train\n",
    "            estimator_error = np.average(incorrect, weights=sample_weight)\n",
    "            if estimator_error <= 0:\n",
    "                # Perfect stump: log((1 - err) / err) would divide by zero and\n",
    "                # turn the sample weights into nan. Keep it with unit weight\n",
    "                # and stop, as scikit-learn does.\n",
    "                self.estimators_.append(est)\n",
    "                self.estimator_errors_[i] = 0.0\n",
    "                self.estimator_weights_[i] = 1.0\n",
    "                break\n",
    "            if estimator_error >= 1 - 1 / self.n_classes_:\n",
    "                # No better than random guessing (its weight would be <= 0):\n",
    "                # discard it and stop boosting.\n",
    "                if not self.estimators_:\n",
    "                    raise ValueError(\"The first weak learner is no better than \"\n",
    "                                     \"random guessing; the ensemble cannot be fit.\")\n",
    "                break\n",
    "            estimator_weight = (np.log((1 - estimator_error) / estimator_error) +\n",
    "                                np.log(self.n_classes_ - 1))\n",
    "            # Only misclassified samples are scaled up before renormalising.\n",
    "            sample_weight *= np.exp(estimator_weight * incorrect)\n",
    "            sample_weight /= np.sum(sample_weight)\n",
    "            self.estimators_.append(est)\n",
    "            self.estimator_errors_[i] = estimator_error\n",
    "            self.estimator_weights_[i] = estimator_weight\n",
    "        return self\n",
    "\n",
    "    def decision_function(self, X):\n",
    "        \"\"\"Return the normalised weighted votes of the fitted stumps.\n",
    "\n",
    "        The result has shape (n_samples, n_classes), except for two classes\n",
    "        where it is the 1-D difference: votes for ``classes_[1]`` minus votes\n",
    "        for ``classes_[0]``.\n",
    "        \"\"\"\n",
    "        pred = np.zeros((X.shape[0], self.n_classes_))\n",
    "        rows = np.arange(X.shape[0])\n",
    "        # zip stops at the last fitted stump, so an ensemble that stopped\n",
    "        # early is handled without indexing past ``estimators_``.\n",
    "        for est, weight in zip(self.estimators_, self.estimator_weights_):\n",
    "            pred[rows, est.predict(X)] += weight\n",
    "        pred /= np.sum(self.estimator_weights_)\n",
    "        if self.n_classes_ == 2:\n",
    "            return pred[:, 1] - pred[:, 0]\n",
    "        return pred\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"Return the label from ``classes_`` with the highest decision score.\"\"\"\n",
    "        scores = self.decision_function(X)\n",
    "        if scores.ndim == 1:\n",
    "            indices = (scores > 0).astype(int)\n",
    "        else:\n",
    "            indices = np.argmax(scores, axis=1)\n",
    "        return self.classes_[indices]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_breast_cancer(return_X_y=True)\n",
    "clf1 = AdaBoostClassifier(random_state=0).fit(X, y)\n",
    "clf2 = skAdaBoostClassifier(algorithm=\"SAMME\", random_state=0).fit(X, y)\n",
    "assert_matches_sklearn(clf1, clf2, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_iris(return_X_y=True)\n",
    "clf1 = AdaBoostClassifier(random_state=0).fit(X, y)\n",
    "clf2 = skAdaBoostClassifier(algorithm=\"SAMME\", random_state=0).fit(X, y)\n",
    "assert_matches_sklearn(clf1, clf2, X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Implementation 2\n",
    "- SAMME.R\n",
    "- Uses the stumps' class probabilities instead of hard votes; every fitted stump gets weight 1.\n",
    "- This cell redefines `AdaBoostClassifier`, so run the notebook top to bottom.\n",
    "- scikit-learn deprecated `algorithm=\"SAMME.R\"` in 1.4 and removed it in 1.6; the comparison cells below need an older release (confirm against your installed version)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class AdaBoostClassifier():\n",
    "    \"\"\"Multi-class AdaBoost (SAMME.R) over depth-1 decision trees.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_estimators : int, default=50\n",
    "        Maximum number of boosting rounds. Boosting stops earlier when a\n",
    "        stump classifies the weighted training set perfectly.\n",
    "    random_state : int, default=0\n",
    "        Seed of the generator that draws each stump's ``random_state``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, n_estimators=50, random_state=0):\n",
    "        self.n_estimators = n_estimators\n",
    "        self.random_state = random_state\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"Boost decision stumps on ``X`` with labels ``y`` and return ``self``.\n",
    "\n",
    "        Sets ``classes_``, ``n_classes_``, ``estimators_``,\n",
    "        ``estimator_weights_`` and ``estimator_errors_``. Rounds that never\n",
    "        ran keep weight 0 and error 1.\n",
    "        \"\"\"\n",
    "        self.classes_, y_train = np.unique(y, return_inverse=True)\n",
    "        self.n_classes_ = len(self.classes_)\n",
    "        sample_weight = np.full(X.shape[0], 1 / X.shape[0])\n",
    "        self.estimators_ = []\n",
    "        self.estimator_weights_ = np.zeros(self.n_estimators)\n",
    "        self.estimator_errors_ = np.ones(self.n_estimators)\n",
    "        MAX_INT = np.iinfo(np.int32).max\n",
    "        rng = np.random.RandomState(self.random_state)\n",
    "        for i in range(self.n_estimators):\n",
    "            est = DecisionTreeClassifier(max_depth=1,\n",
    "                                         random_state=rng.randint(MAX_INT))\n",
    "            est.fit(X, y_train, sample_weight=sample_weight)\n",
    "            y_predict = est.predict(X)\n",
    "            estimator_error = np.average(y_predict != y_train, weights=sample_weight)\n",
    "            self.estimators_.append(est)\n",
    "            self.estimator_errors_[i] = estimator_error\n",
    "            self.estimator_weights_[i] = 1\n",
    "            if estimator_error <= 0:\n",
    "                # Perfect stump: stop here, as scikit-learn does. This also\n",
    "                # avoids the division by zero in the label coding below when\n",
    "                # y holds a single class.\n",
    "                break\n",
    "            y_predict_proba = est.predict_proba(X)\n",
    "            # Clip in place so the logarithm below never sees an exact zero.\n",
    "            np.clip(y_predict_proba, np.finfo(y_predict_proba.dtype).eps, None, y_predict_proba)\n",
    "            # Label coding: 1 for the true class, -1 / (K - 1) for the others.\n",
    "            y_coding = np.full((y_train.shape[0], self.n_classes_), -1 / (self.n_classes_ - 1))\n",
    "            y_coding[np.arange(y_train.shape[0]), y_train] = 1\n",
    "            sample_weight *= np.exp(-1 * ((self.n_classes_ - 1) / self.n_classes_)\n",
    "                                    * xlogy(y_coding, y_predict_proba).sum(axis=1))\n",
    "            sample_weight /= np.sum(sample_weight)\n",
    "        return self\n",
    "\n",
    "    def decision_function(self, X):\n",
    "        \"\"\"Return the averaged, centred log-probability scores of the fitted stumps.\n",
    "\n",
    "        The result has shape (n_samples, n_classes), except for two classes\n",
    "        where it is the 1-D difference: score of ``classes_[1]`` minus score\n",
    "        of ``classes_[0]``.\n",
    "        \"\"\"\n",
    "        pred = np.zeros((X.shape[0], self.n_classes_))\n",
    "        # Iterate over the stumps that were actually fitted, so an ensemble\n",
    "        # that stopped early does not index past ``estimators_``.\n",
    "        for est in self.estimators_:\n",
    "            y_predict_proba = est.predict_proba(X)\n",
    "            np.clip(y_predict_proba, np.finfo(y_predict_proba.dtype).eps, None, y_predict_proba)\n",
    "            log_proba = np.log(y_predict_proba)\n",
    "            pred += ((self.n_classes_ - 1)\n",
    "                     * (log_proba - (1 / self.n_classes_) * log_proba.sum(axis=1)[:, np.newaxis]))\n",
    "        pred /= np.sum(self.estimator_weights_)\n",
    "        if self.n_classes_ == 2:\n",
    "            return pred[:, 1] - pred[:, 0]\n",
    "        return pred\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"Return the label from ``classes_`` with the highest decision score.\"\"\"\n",
    "        scores = self.decision_function(X)\n",
    "        if scores.ndim == 1:\n",
    "            indices = (scores > 0).astype(int)\n",
    "        else:\n",
    "            indices = np.argmax(scores, axis=1)\n",
    "        return self.classes_[indices]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_breast_cancer(return_X_y=True)\n",
    "clf1 = AdaBoostClassifier(random_state=0).fit(X, y)\n",
    "clf2 = skAdaBoostClassifier(algorithm=\"SAMME.R\", random_state=0).fit(X, y)\n",
    "assert_matches_sklearn(clf1, clf2, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_iris(return_X_y=True)\n",
    "clf1 = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X, y)\n",
    "clf2 = skAdaBoostClassifier(n_estimators=50, algorithm=\"SAMME.R\", random_state=0).fit(X, y)\n",
    "assert_matches_sklearn(clf1, clf2, X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dev",
   "language": "python",
   "name": "dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}