{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring regularization for linear regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Goal\n", "\n", "The goal of this lab is to explore the effect of regularization on the coefficients and accuracy of linear regression models for a toy (Ames housing) dataset." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "np.random.seed(999)\n", "\n", "from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.datasets import load_wine, load_boston\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import r2_score\n", "\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "%config InlineBackend.figure_format = 'retina'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data, drop columns with missing values, one-hot encode categoricals" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Id | \n", "MSSubClass | \n", "MSZoning | \n", "LotFrontage | \n", "LotArea | \n", "Street | \n", "Alley | \n", "LotShape | \n", "LandContour | \n", "Utilities | \n", "... | \n", "PoolArea | \n", "PoolQC | \n", "Fence | \n", "MiscFeature | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SaleType | \n", "SaleCondition | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "60 | \n", "RL | \n", "65.0 | \n", "8450 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2008 | \n", "WD | \n", "Normal | \n", "208500 | \n", "
1 | \n", "2 | \n", "20 | \n", "RL | \n", "80.0 | \n", "9600 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2007 | \n", "WD | \n", "Normal | \n", "181500 | \n", "
2 rows × 81 columns
\n", "\n", " | OverallQual | \n", "3SsnPorch | \n", "MoSold | \n", "WoodDeckSF | \n", "FullBath | \n", "LowQualFinSF | \n", "MSSubClass | \n", "BsmtFinSF1 | \n", "BsmtUnfSF | \n", "LotArea | \n", "... | \n", "LandContour_Bnk | \n", "LandContour_HLS | \n", "LandContour_Low | \n", "LandContour_Lvl | \n", "RoofStyle_Flat | \n", "RoofStyle_Gable | \n", "RoofStyle_Gambrel | \n", "RoofStyle_Hip | \n", "RoofStyle_Mansard | \n", "RoofStyle_Shed | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1441 | \n", "-0.071836 | \n", "-0.116339 | \n", "-0.489110 | \n", "0.437009 | \n", "-1.026041 | \n", "-0.120242 | \n", "1.492282 | \n", "0.555685 | \n", "-0.942327 | \n", "-0.610435 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
533 | \n", "-3.688413 | \n", "-0.116339 | \n", "-1.969111 | \n", "-0.752176 | \n", "-1.026041 | \n", "-0.120242 | \n", "-0.872563 | \n", "-0.973018 | \n", "-1.284176 | \n", "-0.552908 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
1078 | \n", "-0.071836 | \n", "-0.116339 | \n", "-0.489110 | \n", "0.365179 | \n", "-1.026041 | \n", "-0.120242 | \n", "1.492282 | \n", "0.478921 | \n", "-0.863090 | \n", "-0.609533 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 rows × 216 columns
\n", "\n", "lasso = Lasso(alpha=lmbda, tol=.1)\n", "lasso.fit(X_train, y_train)\n", "\n", "
\n", "sum(np.abs(lasso.coef_) < 1e-5) # how many close to 0?\n", "\n", "
\n", "ridge = Ridge(alpha=lmbda)\n", "ridge.fit(X_train, y_train)\n", "\n", "