{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring regularization for logistic regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Goal\n", "\n", "The goal of this lab is to explore the effect of regularization on the coefficients and accuracy of logistic regression models for a toy (wine) dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "np.random.seed(999)\n", "\n", "from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.datasets import load_wine\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, precision_score, recall_score, log_loss, mean_absolute_error\n", "\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "%config InlineBackend.figure_format = 'retina'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pandas.api.types import is_numeric_dtype\n", "def normalize(X):\n", "    \"\"\"Standardize each numeric column of DataFrame X in place.\n", "\n", "    Every numeric column has its mean (np.mean) subtracted and is divided by its\n", "    standard deviation (np.std). A column whose standard deviation is 0 is only\n", "    centered, so it becomes all zeros instead of NaN from a 0/0 division.\n", "    Non-numeric columns are left untouched. X is modified; nothing is returned.\n", "    \"\"\"\n", "    for colname in X.columns:\n", "        if is_numeric_dtype(X[colname]):\n", "            u = np.mean(X[colname])\n", "            s = np.std(X[colname])\n", "            if s == 0:\n", "                s = 1.0  # constant column: center only, avoid dividing by zero\n", "            X[colname] = (X[colname] - u) / s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data, create 2-class problem" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "130 records for classes {0,1} from 178 records\n" ] }, { "data": { "text/html": [ "
\n", " | alcohol | \n", "malic_acid | \n", "ash | \n", "alcalinity_of_ash | \n", "magnesium | \n", "total_phenols | \n", "flavanoids | \n", "nonflavanoid_phenols | \n", "proanthocyanins | \n", "color_intensity | \n", "hue | \n", "od280/od315_of_diluted_wines | \n", "proline | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "14.23 | \n", "1.71 | \n", "2.43 | \n", "15.6 | \n", "127.0 | \n", "2.80 | \n", "3.06 | \n", "0.28 | \n", "2.29 | \n", "5.64 | \n", "1.04 | \n", "3.92 | \n", "1065.0 | \n", "
1 | \n", "13.20 | \n", "1.78 | \n", "2.14 | \n", "11.2 | \n", "100.0 | \n", "2.65 | \n", "2.76 | \n", "0.26 | \n", "1.28 | \n", "4.38 | \n", "1.05 | \n", "3.40 | \n", "1050.0 | \n", "
\n", "# penalty=None turns regularization off (scikit-learn >= 1.2; the old 'none' string was removed in 1.4)\n", "lg = LogisticRegression(penalty=None, solver='lbfgs', max_iter=200)\n", "lg.fit(X_train, y_train)\n", "\n", "
\n", "lg_beta = lg.coef_[0]\n", "near_zero = np.abs(lg_beta) < 1e-5 # mask of coefficients with magnitude below 1e-5\n", "np.sum(near_zero) # how many close to 0?\n", "\n", "