# %% [code]
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor as skSGDRegressor
try:
    # Kept only for backward compatibility with older environments:
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# %% [markdown]
# ### Implementation 1
# - scikit-learn loss="squared_error" (named "squared_loss" before 1.2), penalty="l2"/None
# - similar to sklearn.linear_model.LinearRegression

# %% [code]
def _loss(x, y, coef, intercept):
    """Squared-error loss 0.5 * (p - y)^2 for a single sample."""
    p = np.dot(x, coef) + intercept
    return 0.5 * (p - y) * (p - y)

def _grad(x, y, coef, intercept):
    """Per-sample gradient of the squared-error loss.

    Returns the pair (coef_grad, intercept_grad).
    """
    p = np.dot(x, coef) + intercept
    # clip the loss derivative to +/-1e12 (consistent with scikit-learn)
    dloss = np.clip(p - y, -1e12, 1e12)
    coef_grad = dloss * x
    intercept_grad = dloss
    return coef_grad, intercept_grad

# %% [code]
class SGDRegressor():
    """Plain SGD linear regression with squared-error loss.

    Mirrors sklearn.linear_model.SGDRegressor with loss="squared_error",
    penalty None (or the legacy string "none") or "l2", and the default
    invscaling learning rate eta = eta0 / t**power_t.
    """

    def __init__(self, penalty="l2", alpha=0.0001, max_iter=1000, tol=1e-3,
                 shuffle=True, random_state=0,
                 eta0=0.01, power_t=0.25, n_iter_no_change=5):
        self.penalty = penalty
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.shuffle = shuffle
        self.random_state = random_state
        self.eta0 = eta0
        self.power_t = power_t
        self.n_iter_no_change = n_iter_no_change

    def fit(self, X, y):
        """Fit by per-sample SGD with early stopping on the epoch loss."""
        coef = np.zeros(X.shape[1])
        intercept = 0
        best_loss = np.inf
        no_improvement_count = 0
        t = 1
        rng = np.random.RandomState(self.random_state)
        for epoch in range(self.max_iter):
            # different from how data is shuffled in scikit-learn
            if self.shuffle:
                ind = rng.permutation(X.shape[0])
                X, y = X[ind], y[ind]
            sumloss = 0
            for i in range(X.shape[0]):
                sumloss += _loss(X[i], y[i], coef, intercept)
                eta = self.eta0 / np.power(t, self.power_t)
                coef_grad, intercept_grad = _grad(X[i], y[i], coef, intercept)
                if self.penalty == "l2":
                    # l2 decay applies to the weights only, never the intercept
                    coef *= 1 - eta * self.alpha
                coef -= eta * coef_grad
                intercept -= eta * intercept_grad
                t += 1
            # stop when the epoch loss fails to beat the best loss by
            # tol * n_samples for n_iter_no_change consecutive epochs
            if sumloss > best_loss - self.tol * X.shape[0]:
                no_improvement_count += 1
            else:
                no_improvement_count = 0
            if no_improvement_count == self.n_iter_no_change:
                break
            if sumloss < best_loss:
                best_loss = sumloss
        self.coef_ = coef
        self.intercept_ = np.array([intercept])
        self.n_iter_ = epoch + 1
        return self

    def predict(self, X):
        """Return X @ coef_ + intercept_."""
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred

# %% [code]
# shuffle=False penalty=None
# Synthetic regression data (load_boston was removed in scikit-learn 1.2);
# the equivalence check does not depend on which dataset is used.
data_rng = np.random.RandomState(0)
X = data_rng.randn(506, 13)
y = X @ data_rng.randn(13) * 10.0 + 5.0 + data_rng.randn(506)
X = StandardScaler().fit_transform(X)
clf1 = SGDRegressor(shuffle=False, penalty=None).fit(X, y)
clf2 = skSGDRegressor(shuffle=False, penalty=None).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.allclose(pred1, pred2)

# %% [code]
# shuffle=False penalty="l2"
# X, y are unchanged by the fits above (shuffle=False), so reuse them.
for alpha in [0.1, 1, 10]:
    clf1 = SGDRegressor(shuffle=False, alpha=alpha).fit(X, y)
    clf2 = skSGDRegressor(shuffle=False, alpha=alpha).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.allclose(pred1, pred2)
# %% [markdown]
# ### Implementation 2
# - scikit-learn loss = "huber", penalty="l2"/None

# %% [code]
def _huber_loss(x, y, coef, intercept, epsilon):
    """Huber loss for one sample: quadratic for |r| <= epsilon, linear outside."""
    p = np.dot(x, coef) + intercept
    r = p - y
    if np.abs(r) <= epsilon:
        return 0.5 * r * r
    return epsilon * (np.abs(r) - 0.5 * epsilon)

def _huber_grad(x, y, coef, intercept, epsilon):
    """Per-sample gradient of the Huber loss.

    Returns the pair (coef_grad, intercept_grad).
    """
    p = np.dot(x, coef) + intercept
    r = p - y
    if np.abs(r) <= epsilon:
        dloss = r
    elif r > epsilon:
        dloss = epsilon
    else:
        dloss = -epsilon
    # clip the loss derivative (consistent with scikit-learn)
    dloss = np.clip(dloss, -1e12, 1e12)
    return dloss * x, dloss

# %% [code]
class HuberSGDRegressor():
    """SGD linear regression with the Huber loss (sklearn loss="huber").

    Renamed from the original duplicated `SGDRegressor` definition so that
    Implementation 1 is not silently shadowed when the notebook is run
    top to bottom.
    """

    def __init__(self, penalty="l2", alpha=0.0001, max_iter=1000, tol=1e-3,
                 shuffle=True, epsilon=0.1, random_state=0,
                 eta0=0.01, power_t=0.25, n_iter_no_change=5):
        self.penalty = penalty
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.shuffle = shuffle
        self.epsilon = epsilon
        self.random_state = random_state
        self.eta0 = eta0
        self.power_t = power_t
        self.n_iter_no_change = n_iter_no_change

    def fit(self, X, y):
        """Fit by per-sample SGD with early stopping on the epoch loss."""
        coef = np.zeros(X.shape[1])
        intercept = 0
        best_loss = np.inf
        no_improvement_count = 0
        t = 1
        rng = np.random.RandomState(self.random_state)
        for epoch in range(self.max_iter):
            # different from how data is shuffled in scikit-learn
            if self.shuffle:
                ind = rng.permutation(X.shape[0])
                X, y = X[ind], y[ind]
            sumloss = 0
            for i in range(X.shape[0]):
                sumloss += _huber_loss(X[i], y[i], coef, intercept, self.epsilon)
                eta = self.eta0 / np.power(t, self.power_t)
                coef_grad, intercept_grad = _huber_grad(X[i], y[i], coef,
                                                        intercept, self.epsilon)
                if self.penalty == "l2":
                    # l2 decay applies to the weights only, never the intercept
                    coef *= 1 - eta * self.alpha
                coef -= eta * coef_grad
                intercept -= eta * intercept_grad
                t += 1
            # stop when the epoch loss fails to beat the best loss by
            # tol * n_samples for n_iter_no_change consecutive epochs
            if sumloss > best_loss - self.tol * X.shape[0]:
                no_improvement_count += 1
            else:
                no_improvement_count = 0
            if no_improvement_count == self.n_iter_no_change:
                break
            if sumloss < best_loss:
                best_loss = sumloss
        self.coef_ = coef
        self.intercept_ = np.array([intercept])
        self.n_iter_ = epoch + 1
        return self

    def predict(self, X):
        """Return X @ coef_ + intercept_."""
        return np.dot(X, self.coef_) + self.intercept_

# %% [code]
# shuffle=False penalty=None
# Synthetic regression data (load_boston was removed in scikit-learn 1.2);
# the equivalence check does not depend on which dataset is used.
data_rng = np.random.RandomState(0)
X = data_rng.randn(506, 13)
y = X @ data_rng.randn(13) * 10.0 + 5.0 + data_rng.randn(506)
X = StandardScaler().fit_transform(X)
clf1 = HuberSGDRegressor(shuffle=False, penalty=None).fit(X, y)
clf2 = skSGDRegressor(loss="huber", shuffle=False, penalty=None).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.allclose(pred1, pred2)

# %% [code]
# shuffle=False penalty="l2"
# X, y are unchanged by the fits above (shuffle=False), so reuse them.
for alpha in [0.1, 1, 10]:
    clf1 = HuberSGDRegressor(shuffle=False, alpha=alpha).fit(X, y)
    clf2 = skSGDRegressor(loss="huber", shuffle=False, alpha=alpha).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.allclose(pred1, pred2)
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }