{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# HuberRegressor vs Ridge on dataset with strong outliers\n\nFit Ridge and HuberRegressor on a dataset with outliers.\n\nThe example shows that the predictions in ridge are strongly influenced\nby the outliers present in the dataset. The Huber regressor is less\ninfluenced by the outliers since the model uses the linear loss for these.\nAs the parameter epsilon is increased for the Huber regressor, the decision\nfunction approaches that of the ridge.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import make_regression\nfrom sklearn.linear_model import HuberRegressor, Ridge\n\n# Generate toy data.\nrng = np.random.RandomState(0)\nX, y = make_regression(\n n_samples=20, n_features=1, random_state=0, noise=4.0, bias=100.0\n)\n\n# Add four strong outliers to the dataset.\nX_outliers = rng.normal(0, 0.5, size=(4, 1))\ny_outliers = rng.normal(0, 2.0, size=4)\nX_outliers[:2, :] += X.max() + X.mean() / 4.0\nX_outliers[2:, :] += X.min() - X.mean() / 4.0\ny_outliers[:2] += y.min() - y.mean() / 4.0\ny_outliers[2:] += y.max() + y.mean() / 4.0\nX = np.vstack((X, X_outliers))\ny = np.concatenate((y, y_outliers))\nplt.plot(X, y, \"b.\")\n\n# Fit the huber regressor over a series of epsilon values.\ncolors = [\"r-\", \"b-\", \"y-\", \"m-\"]\n\nx = np.linspace(X.min(), X.max(), 7)\nepsilon_values = [1, 1.5, 1.75, 1.9]\nfor k, epsilon in enumerate(epsilon_values):\n huber = HuberRegressor(alpha=0.0, epsilon=epsilon)\n huber.fit(X, y)\n coef_ = huber.coef_ * x + huber.intercept_\n plt.plot(x, coef_, colors[k], label=\"huber loss, %s\" % epsilon)\n\n# Fit a ridge regressor to compare it to huber regressor.\nridge = Ridge(alpha=0.0, random_state=0)\nridge.fit(X, y)\ncoef_ridge = ridge.coef_\ncoef_ = ridge.coef_ * x + ridge.intercept_\nplt.plot(x, coef_, \"g-\", label=\"ridge regression\")\n\nplt.title(\"Comparison of HuberRegressor vs Ridge\")\nplt.xlabel(\"X\")\nplt.ylabel(\"y\")\nplt.legend(loc=0)\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }