{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Chapter 5 - Resampling Methods"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"- [Lab 5.3.1 The Validation Set Approach](#lab-5.3.1)\n",
"- [Lab 5.3.2 Leave-One-Out Cross-Validation](#lab-5.3.2)\n",
"- [Lab 5.3.3 k-Fold Cross-Validation](#lab-5.3.3)\n",
"- [Lab 5.3.4 The Bootstrap](#lab-5.3.4)"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### Imports and Configurations"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# Standard imports\n",
"import warnings\n",
"\n",
"# Use rpy2 for loading R datasets\n",
"from rpy2.robjects.packages import importr\n",
"from rpy2.robjects.packages import data as rdata\n",
"from rpy2.robjects import pandas2ri\n",
"\n",
"# Math and data processing\n",
"import numpy as np\n",
"import scipy as sp\n",
"import pandas as pd\n",
"\n",
"# scikit-learn\n",
"from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score\n",
"from sklearn.utils import resample\n",
"from sklearn.preprocessing import scale, PolynomialFeatures\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"# Visulization\n",
"from IPython.display import display\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"mpl.style.use('ggplot')"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### Lab 5.3.1 The Validation Set Approach"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" mpg | \n",
" cylinders | \n",
" displacement | \n",
" horsepower | \n",
" weight | \n",
" acceleration | \n",
" year | \n",
" origin | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 18.0 | \n",
" 8.0 | \n",
" 307.0 | \n",
" 130.0 | \n",
" 3504.0 | \n",
" 12.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" chevrolet chevelle malibu | \n",
"
\n",
" \n",
" 2 | \n",
" 15.0 | \n",
" 8.0 | \n",
" 350.0 | \n",
" 165.0 | \n",
" 3693.0 | \n",
" 11.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" buick skylark 320 | \n",
"
\n",
" \n",
" 3 | \n",
" 18.0 | \n",
" 8.0 | \n",
" 318.0 | \n",
" 150.0 | \n",
" 3436.0 | \n",
" 11.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" plymouth satellite | \n",
"
\n",
" \n",
" 4 | \n",
" 16.0 | \n",
" 8.0 | \n",
" 304.0 | \n",
" 150.0 | \n",
" 3433.0 | \n",
" 12.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" amc rebel sst | \n",
"
\n",
" \n",
" 5 | \n",
" 17.0 | \n",
" 8.0 | \n",
" 302.0 | \n",
" 140.0 | \n",
" 3449.0 | \n",
" 10.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" ford torino | \n",
"
\n",
" \n",
" 6 | \n",
" 15.0 | \n",
" 8.0 | \n",
" 429.0 | \n",
" 198.0 | \n",
" 4341.0 | \n",
" 10.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" ford galaxie 500 | \n",
"
\n",
" \n",
" 7 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 454.0 | \n",
" 220.0 | \n",
" 4354.0 | \n",
" 9.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" chevrolet impala | \n",
"
\n",
" \n",
" 8 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 440.0 | \n",
" 215.0 | \n",
" 4312.0 | \n",
" 8.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" plymouth fury iii | \n",
"
\n",
" \n",
" 9 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 455.0 | \n",
" 225.0 | \n",
" 4425.0 | \n",
" 10.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" pontiac catalina | \n",
"
\n",
" \n",
" 10 | \n",
" 15.0 | \n",
" 8.0 | \n",
" 390.0 | \n",
" 190.0 | \n",
" 3850.0 | \n",
" 8.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" amc ambassador dpl | \n",
"
\n",
" \n",
" 11 | \n",
" 15.0 | \n",
" 8.0 | \n",
" 383.0 | \n",
" 170.0 | \n",
" 3563.0 | \n",
" 10.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" dodge challenger se | \n",
"
\n",
" \n",
" 12 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 340.0 | \n",
" 160.0 | \n",
" 3609.0 | \n",
" 8.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" plymouth 'cuda 340 | \n",
"
\n",
" \n",
" 13 | \n",
" 15.0 | \n",
" 8.0 | \n",
" 400.0 | \n",
" 150.0 | \n",
" 3761.0 | \n",
" 9.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" chevrolet monte carlo | \n",
"
\n",
" \n",
" 14 | \n",
" 14.0 | \n",
" 8.0 | \n",
" 455.0 | \n",
" 225.0 | \n",
" 3086.0 | \n",
" 10.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" buick estate wagon (sw) | \n",
"
\n",
" \n",
" 15 | \n",
" 24.0 | \n",
" 4.0 | \n",
" 113.0 | \n",
" 95.0 | \n",
" 2372.0 | \n",
" 15.0 | \n",
" 70.0 | \n",
" 3.0 | \n",
" toyota corona mark ii | \n",
"
\n",
" \n",
" 16 | \n",
" 22.0 | \n",
" 6.0 | \n",
" 198.0 | \n",
" 95.0 | \n",
" 2833.0 | \n",
" 15.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" plymouth duster | \n",
"
\n",
" \n",
" 17 | \n",
" 18.0 | \n",
" 6.0 | \n",
" 199.0 | \n",
" 97.0 | \n",
" 2774.0 | \n",
" 15.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" amc hornet | \n",
"
\n",
" \n",
" 18 | \n",
" 21.0 | \n",
" 6.0 | \n",
" 200.0 | \n",
" 85.0 | \n",
" 2587.0 | \n",
" 16.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" ford maverick | \n",
"
\n",
" \n",
" 19 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 97.0 | \n",
" 88.0 | \n",
" 2130.0 | \n",
" 14.5 | \n",
" 70.0 | \n",
" 3.0 | \n",
" datsun pl510 | \n",
"
\n",
" \n",
" 20 | \n",
" 26.0 | \n",
" 4.0 | \n",
" 97.0 | \n",
" 46.0 | \n",
" 1835.0 | \n",
" 20.5 | \n",
" 70.0 | \n",
" 2.0 | \n",
" volkswagen 1131 deluxe sedan | \n",
"
\n",
" \n",
" 21 | \n",
" 25.0 | \n",
" 4.0 | \n",
" 110.0 | \n",
" 87.0 | \n",
" 2672.0 | \n",
" 17.5 | \n",
" 70.0 | \n",
" 2.0 | \n",
" peugeot 504 | \n",
"
\n",
" \n",
" 22 | \n",
" 24.0 | \n",
" 4.0 | \n",
" 107.0 | \n",
" 90.0 | \n",
" 2430.0 | \n",
" 14.5 | \n",
" 70.0 | \n",
" 2.0 | \n",
" audi 100 ls | \n",
"
\n",
" \n",
" 23 | \n",
" 25.0 | \n",
" 4.0 | \n",
" 104.0 | \n",
" 95.0 | \n",
" 2375.0 | \n",
" 17.5 | \n",
" 70.0 | \n",
" 2.0 | \n",
" saab 99e | \n",
"
\n",
" \n",
" 24 | \n",
" 26.0 | \n",
" 4.0 | \n",
" 121.0 | \n",
" 113.0 | \n",
" 2234.0 | \n",
" 12.5 | \n",
" 70.0 | \n",
" 2.0 | \n",
" bmw 2002 | \n",
"
\n",
" \n",
" 25 | \n",
" 21.0 | \n",
" 6.0 | \n",
" 199.0 | \n",
" 90.0 | \n",
" 2648.0 | \n",
" 15.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" amc gremlin | \n",
"
\n",
" \n",
" 26 | \n",
" 10.0 | \n",
" 8.0 | \n",
" 360.0 | \n",
" 215.0 | \n",
" 4615.0 | \n",
" 14.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" ford f250 | \n",
"
\n",
" \n",
" 27 | \n",
" 10.0 | \n",
" 8.0 | \n",
" 307.0 | \n",
" 200.0 | \n",
" 4376.0 | \n",
" 15.0 | \n",
" 70.0 | \n",
" 1.0 | \n",
" chevy c20 | \n",
"
\n",
" \n",
" 28 | \n",
" 11.0 | \n",
" 8.0 | \n",
" 318.0 | \n",
" 210.0 | \n",
" 4382.0 | \n",
" 13.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" dodge d200 | \n",
"
\n",
" \n",
" 29 | \n",
" 9.0 | \n",
" 8.0 | \n",
" 304.0 | \n",
" 193.0 | \n",
" 4732.0 | \n",
" 18.5 | \n",
" 70.0 | \n",
" 1.0 | \n",
" hi 1200d | \n",
"
\n",
" \n",
" 30 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 97.0 | \n",
" 88.0 | \n",
" 2130.0 | \n",
" 14.5 | \n",
" 71.0 | \n",
" 3.0 | \n",
" datsun pl510 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 368 | \n",
" 28.0 | \n",
" 4.0 | \n",
" 112.0 | \n",
" 88.0 | \n",
" 2605.0 | \n",
" 19.6 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chevrolet cavalier | \n",
"
\n",
" \n",
" 369 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 112.0 | \n",
" 88.0 | \n",
" 2640.0 | \n",
" 18.6 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chevrolet cavalier wagon | \n",
"
\n",
" \n",
" 370 | \n",
" 34.0 | \n",
" 4.0 | \n",
" 112.0 | \n",
" 88.0 | \n",
" 2395.0 | \n",
" 18.0 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chevrolet cavalier 2-door | \n",
"
\n",
" \n",
" 371 | \n",
" 31.0 | \n",
" 4.0 | \n",
" 112.0 | \n",
" 85.0 | \n",
" 2575.0 | \n",
" 16.2 | \n",
" 82.0 | \n",
" 1.0 | \n",
" pontiac j2000 se hatchback | \n",
"
\n",
" \n",
" 372 | \n",
" 29.0 | \n",
" 4.0 | \n",
" 135.0 | \n",
" 84.0 | \n",
" 2525.0 | \n",
" 16.0 | \n",
" 82.0 | \n",
" 1.0 | \n",
" dodge aries se | \n",
"
\n",
" \n",
" 373 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 151.0 | \n",
" 90.0 | \n",
" 2735.0 | \n",
" 18.0 | \n",
" 82.0 | \n",
" 1.0 | \n",
" pontiac phoenix | \n",
"
\n",
" \n",
" 374 | \n",
" 24.0 | \n",
" 4.0 | \n",
" 140.0 | \n",
" 92.0 | \n",
" 2865.0 | \n",
" 16.4 | \n",
" 82.0 | \n",
" 1.0 | \n",
" ford fairmont futura | \n",
"
\n",
" \n",
" 375 | \n",
" 36.0 | \n",
" 4.0 | \n",
" 105.0 | \n",
" 74.0 | \n",
" 1980.0 | \n",
" 15.3 | \n",
" 82.0 | \n",
" 2.0 | \n",
" volkswagen rabbit l | \n",
"
\n",
" \n",
" 376 | \n",
" 37.0 | \n",
" 4.0 | \n",
" 91.0 | \n",
" 68.0 | \n",
" 2025.0 | \n",
" 18.2 | \n",
" 82.0 | \n",
" 3.0 | \n",
" mazda glc custom l | \n",
"
\n",
" \n",
" 377 | \n",
" 31.0 | \n",
" 4.0 | \n",
" 91.0 | \n",
" 68.0 | \n",
" 1970.0 | \n",
" 17.6 | \n",
" 82.0 | \n",
" 3.0 | \n",
" mazda glc custom | \n",
"
\n",
" \n",
" 378 | \n",
" 38.0 | \n",
" 4.0 | \n",
" 105.0 | \n",
" 63.0 | \n",
" 2125.0 | \n",
" 14.7 | \n",
" 82.0 | \n",
" 1.0 | \n",
" plymouth horizon miser | \n",
"
\n",
" \n",
" 379 | \n",
" 36.0 | \n",
" 4.0 | \n",
" 98.0 | \n",
" 70.0 | \n",
" 2125.0 | \n",
" 17.3 | \n",
" 82.0 | \n",
" 1.0 | \n",
" mercury lynx l | \n",
"
\n",
" \n",
" 380 | \n",
" 36.0 | \n",
" 4.0 | \n",
" 120.0 | \n",
" 88.0 | \n",
" 2160.0 | \n",
" 14.5 | \n",
" 82.0 | \n",
" 3.0 | \n",
" nissan stanza xe | \n",
"
\n",
" \n",
" 381 | \n",
" 36.0 | \n",
" 4.0 | \n",
" 107.0 | \n",
" 75.0 | \n",
" 2205.0 | \n",
" 14.5 | \n",
" 82.0 | \n",
" 3.0 | \n",
" honda accord | \n",
"
\n",
" \n",
" 382 | \n",
" 34.0 | \n",
" 4.0 | \n",
" 108.0 | \n",
" 70.0 | \n",
" 2245.0 | \n",
" 16.9 | \n",
" 82.0 | \n",
" 3.0 | \n",
" toyota corolla | \n",
"
\n",
" \n",
" 383 | \n",
" 38.0 | \n",
" 4.0 | \n",
" 91.0 | \n",
" 67.0 | \n",
" 1965.0 | \n",
" 15.0 | \n",
" 82.0 | \n",
" 3.0 | \n",
" honda civic | \n",
"
\n",
" \n",
" 384 | \n",
" 32.0 | \n",
" 4.0 | \n",
" 91.0 | \n",
" 67.0 | \n",
" 1965.0 | \n",
" 15.7 | \n",
" 82.0 | \n",
" 3.0 | \n",
" honda civic (auto) | \n",
"
\n",
" \n",
" 385 | \n",
" 38.0 | \n",
" 4.0 | \n",
" 91.0 | \n",
" 67.0 | \n",
" 1995.0 | \n",
" 16.2 | \n",
" 82.0 | \n",
" 3.0 | \n",
" datsun 310 gx | \n",
"
\n",
" \n",
" 386 | \n",
" 25.0 | \n",
" 6.0 | \n",
" 181.0 | \n",
" 110.0 | \n",
" 2945.0 | \n",
" 16.4 | \n",
" 82.0 | \n",
" 1.0 | \n",
" buick century limited | \n",
"
\n",
" \n",
" 387 | \n",
" 38.0 | \n",
" 6.0 | \n",
" 262.0 | \n",
" 85.0 | \n",
" 3015.0 | \n",
" 17.0 | \n",
" 82.0 | \n",
" 1.0 | \n",
" oldsmobile cutlass ciera (diesel) | \n",
"
\n",
" \n",
" 388 | \n",
" 26.0 | \n",
" 4.0 | \n",
" 156.0 | \n",
" 92.0 | \n",
" 2585.0 | \n",
" 14.5 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chrysler lebaron medallion | \n",
"
\n",
" \n",
" 389 | \n",
" 22.0 | \n",
" 6.0 | \n",
" 232.0 | \n",
" 112.0 | \n",
" 2835.0 | \n",
" 14.7 | \n",
" 82.0 | \n",
" 1.0 | \n",
" ford granada l | \n",
"
\n",
" \n",
" 390 | \n",
" 32.0 | \n",
" 4.0 | \n",
" 144.0 | \n",
" 96.0 | \n",
" 2665.0 | \n",
" 13.9 | \n",
" 82.0 | \n",
" 3.0 | \n",
" toyota celica gt | \n",
"
\n",
" \n",
" 391 | \n",
" 36.0 | \n",
" 4.0 | \n",
" 135.0 | \n",
" 84.0 | \n",
" 2370.0 | \n",
" 13.0 | \n",
" 82.0 | \n",
" 1.0 | \n",
" dodge charger 2.2 | \n",
"
\n",
" \n",
" 392 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 151.0 | \n",
" 90.0 | \n",
" 2950.0 | \n",
" 17.3 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chevrolet camaro | \n",
"
\n",
" \n",
" 393 | \n",
" 27.0 | \n",
" 4.0 | \n",
" 140.0 | \n",
" 86.0 | \n",
" 2790.0 | \n",
" 15.6 | \n",
" 82.0 | \n",
" 1.0 | \n",
" ford mustang gl | \n",
"
\n",
" \n",
" 394 | \n",
" 44.0 | \n",
" 4.0 | \n",
" 97.0 | \n",
" 52.0 | \n",
" 2130.0 | \n",
" 24.6 | \n",
" 82.0 | \n",
" 2.0 | \n",
" vw pickup | \n",
"
\n",
" \n",
" 395 | \n",
" 32.0 | \n",
" 4.0 | \n",
" 135.0 | \n",
" 84.0 | \n",
" 2295.0 | \n",
" 11.6 | \n",
" 82.0 | \n",
" 1.0 | \n",
" dodge rampage | \n",
"
\n",
" \n",
" 396 | \n",
" 28.0 | \n",
" 4.0 | \n",
" 120.0 | \n",
" 79.0 | \n",
" 2625.0 | \n",
" 18.6 | \n",
" 82.0 | \n",
" 1.0 | \n",
" ford ranger | \n",
"
\n",
" \n",
" 397 | \n",
" 31.0 | \n",
" 4.0 | \n",
" 119.0 | \n",
" 82.0 | \n",
" 2720.0 | \n",
" 19.4 | \n",
" 82.0 | \n",
" 1.0 | \n",
" chevy s-10 | \n",
"
\n",
" \n",
"
\n",
"
392 rows × 9 columns
\n",
"
"
],
"text/plain": [
" mpg cylinders displacement horsepower weight acceleration year \\\n",
"1 18.0 8.0 307.0 130.0 3504.0 12.0 70.0 \n",
"2 15.0 8.0 350.0 165.0 3693.0 11.5 70.0 \n",
"3 18.0 8.0 318.0 150.0 3436.0 11.0 70.0 \n",
"4 16.0 8.0 304.0 150.0 3433.0 12.0 70.0 \n",
"5 17.0 8.0 302.0 140.0 3449.0 10.5 70.0 \n",
"6 15.0 8.0 429.0 198.0 4341.0 10.0 70.0 \n",
"7 14.0 8.0 454.0 220.0 4354.0 9.0 70.0 \n",
"8 14.0 8.0 440.0 215.0 4312.0 8.5 70.0 \n",
"9 14.0 8.0 455.0 225.0 4425.0 10.0 70.0 \n",
"10 15.0 8.0 390.0 190.0 3850.0 8.5 70.0 \n",
"11 15.0 8.0 383.0 170.0 3563.0 10.0 70.0 \n",
"12 14.0 8.0 340.0 160.0 3609.0 8.0 70.0 \n",
"13 15.0 8.0 400.0 150.0 3761.0 9.5 70.0 \n",
"14 14.0 8.0 455.0 225.0 3086.0 10.0 70.0 \n",
"15 24.0 4.0 113.0 95.0 2372.0 15.0 70.0 \n",
"16 22.0 6.0 198.0 95.0 2833.0 15.5 70.0 \n",
"17 18.0 6.0 199.0 97.0 2774.0 15.5 70.0 \n",
"18 21.0 6.0 200.0 85.0 2587.0 16.0 70.0 \n",
"19 27.0 4.0 97.0 88.0 2130.0 14.5 70.0 \n",
"20 26.0 4.0 97.0 46.0 1835.0 20.5 70.0 \n",
"21 25.0 4.0 110.0 87.0 2672.0 17.5 70.0 \n",
"22 24.0 4.0 107.0 90.0 2430.0 14.5 70.0 \n",
"23 25.0 4.0 104.0 95.0 2375.0 17.5 70.0 \n",
"24 26.0 4.0 121.0 113.0 2234.0 12.5 70.0 \n",
"25 21.0 6.0 199.0 90.0 2648.0 15.0 70.0 \n",
"26 10.0 8.0 360.0 215.0 4615.0 14.0 70.0 \n",
"27 10.0 8.0 307.0 200.0 4376.0 15.0 70.0 \n",
"28 11.0 8.0 318.0 210.0 4382.0 13.5 70.0 \n",
"29 9.0 8.0 304.0 193.0 4732.0 18.5 70.0 \n",
"30 27.0 4.0 97.0 88.0 2130.0 14.5 71.0 \n",
".. ... ... ... ... ... ... ... \n",
"368 28.0 4.0 112.0 88.0 2605.0 19.6 82.0 \n",
"369 27.0 4.0 112.0 88.0 2640.0 18.6 82.0 \n",
"370 34.0 4.0 112.0 88.0 2395.0 18.0 82.0 \n",
"371 31.0 4.0 112.0 85.0 2575.0 16.2 82.0 \n",
"372 29.0 4.0 135.0 84.0 2525.0 16.0 82.0 \n",
"373 27.0 4.0 151.0 90.0 2735.0 18.0 82.0 \n",
"374 24.0 4.0 140.0 92.0 2865.0 16.4 82.0 \n",
"375 36.0 4.0 105.0 74.0 1980.0 15.3 82.0 \n",
"376 37.0 4.0 91.0 68.0 2025.0 18.2 82.0 \n",
"377 31.0 4.0 91.0 68.0 1970.0 17.6 82.0 \n",
"378 38.0 4.0 105.0 63.0 2125.0 14.7 82.0 \n",
"379 36.0 4.0 98.0 70.0 2125.0 17.3 82.0 \n",
"380 36.0 4.0 120.0 88.0 2160.0 14.5 82.0 \n",
"381 36.0 4.0 107.0 75.0 2205.0 14.5 82.0 \n",
"382 34.0 4.0 108.0 70.0 2245.0 16.9 82.0 \n",
"383 38.0 4.0 91.0 67.0 1965.0 15.0 82.0 \n",
"384 32.0 4.0 91.0 67.0 1965.0 15.7 82.0 \n",
"385 38.0 4.0 91.0 67.0 1995.0 16.2 82.0 \n",
"386 25.0 6.0 181.0 110.0 2945.0 16.4 82.0 \n",
"387 38.0 6.0 262.0 85.0 3015.0 17.0 82.0 \n",
"388 26.0 4.0 156.0 92.0 2585.0 14.5 82.0 \n",
"389 22.0 6.0 232.0 112.0 2835.0 14.7 82.0 \n",
"390 32.0 4.0 144.0 96.0 2665.0 13.9 82.0 \n",
"391 36.0 4.0 135.0 84.0 2370.0 13.0 82.0 \n",
"392 27.0 4.0 151.0 90.0 2950.0 17.3 82.0 \n",
"393 27.0 4.0 140.0 86.0 2790.0 15.6 82.0 \n",
"394 44.0 4.0 97.0 52.0 2130.0 24.6 82.0 \n",
"395 32.0 4.0 135.0 84.0 2295.0 11.6 82.0 \n",
"396 28.0 4.0 120.0 79.0 2625.0 18.6 82.0 \n",
"397 31.0 4.0 119.0 82.0 2720.0 19.4 82.0 \n",
"\n",
" origin name \n",
"1 1.0 chevrolet chevelle malibu \n",
"2 1.0 buick skylark 320 \n",
"3 1.0 plymouth satellite \n",
"4 1.0 amc rebel sst \n",
"5 1.0 ford torino \n",
"6 1.0 ford galaxie 500 \n",
"7 1.0 chevrolet impala \n",
"8 1.0 plymouth fury iii \n",
"9 1.0 pontiac catalina \n",
"10 1.0 amc ambassador dpl \n",
"11 1.0 dodge challenger se \n",
"12 1.0 plymouth 'cuda 340 \n",
"13 1.0 chevrolet monte carlo \n",
"14 1.0 buick estate wagon (sw) \n",
"15 3.0 toyota corona mark ii \n",
"16 1.0 plymouth duster \n",
"17 1.0 amc hornet \n",
"18 1.0 ford maverick \n",
"19 3.0 datsun pl510 \n",
"20 2.0 volkswagen 1131 deluxe sedan \n",
"21 2.0 peugeot 504 \n",
"22 2.0 audi 100 ls \n",
"23 2.0 saab 99e \n",
"24 2.0 bmw 2002 \n",
"25 1.0 amc gremlin \n",
"26 1.0 ford f250 \n",
"27 1.0 chevy c20 \n",
"28 1.0 dodge d200 \n",
"29 1.0 hi 1200d \n",
"30 3.0 datsun pl510 \n",
".. ... ... \n",
"368 1.0 chevrolet cavalier \n",
"369 1.0 chevrolet cavalier wagon \n",
"370 1.0 chevrolet cavalier 2-door \n",
"371 1.0 pontiac j2000 se hatchback \n",
"372 1.0 dodge aries se \n",
"373 1.0 pontiac phoenix \n",
"374 1.0 ford fairmont futura \n",
"375 2.0 volkswagen rabbit l \n",
"376 3.0 mazda glc custom l \n",
"377 3.0 mazda glc custom \n",
"378 1.0 plymouth horizon miser \n",
"379 1.0 mercury lynx l \n",
"380 3.0 nissan stanza xe \n",
"381 3.0 honda accord \n",
"382 3.0 toyota corolla \n",
"383 3.0 honda civic \n",
"384 3.0 honda civic (auto) \n",
"385 3.0 datsun 310 gx \n",
"386 1.0 buick century limited \n",
"387 1.0 oldsmobile cutlass ciera (diesel) \n",
"388 1.0 chrysler lebaron medallion \n",
"389 1.0 ford granada l \n",
"390 3.0 toyota celica gt \n",
"391 1.0 dodge charger 2.2 \n",
"392 1.0 chevrolet camaro \n",
"393 1.0 ford mustang gl \n",
"394 2.0 vw pickup \n",
"395 1.0 dodge rampage \n",
"396 1.0 ford ranger \n",
"397 1.0 chevy s-10 \n",
"\n",
"[392 rows x 9 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Auto dataset is in R ISLR package\n",
"islr = importr('ISLR')\n",
"auto_rdf = rdata(islr).fetch('Auto')['Auto']\n",
"auto = pandas2ri.ri2py(auto_rdf)\n",
"display(auto)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SLR MSE = 25.273723993\n"
]
}
],
"source": [
"# Simple linear regression features and response\n",
"features = ['horsepower']\n",
"response = ['mpg']\n",
"X = auto[features]\n",
"y = auto[response]\n",
"\n",
"# Split Auto data into train and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)\n",
"\n",
"# Regression\n",
"auto_slr = LinearRegression()\n",
"auto_slr.fit(X_train, y_train)\n",
"\n",
"# Prediction and MSE\n",
"y_pred = auto_slr.predict(X_test)\n",
"print(\"SLR MSE = \", mean_squared_error(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Polynomial regression of degree 2: MSE = 18.8690031195\n"
]
}
],
"source": [
"# Polynomial regression features of degree 2\n",
"poly2 = PolynomialFeatures(degree=2)\n",
"X2 = poly2.fit_transform(X)\n",
"\n",
"# Split Auto data into train and test sets\n",
"X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)\n",
"\n",
"# Regression\n",
"auto_poly2 = LinearRegression()\n",
"auto_poly2.fit(X2_train, y_train)\n",
"\n",
"# Prediction and MSE\n",
"y2_pred = auto_poly2.predict(X2_test)\n",
"print(\"Polynomial regression of degree 2: MSE = \", mean_squared_error(y_test, y2_pred))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Polynomial regression of degree 3: MSE = 18.8333669959\n"
]
}
],
"source": [
"# Polynomial regression features of degree 3\n",
"poly3 = PolynomialFeatures(degree=3)\n",
"X3 = poly3.fit_transform(X)\n",
"\n",
"# Split Auto data into train and test sets\n",
"X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)\n",
"\n",
"# Regression\n",
"auto_poly3 = LinearRegression()\n",
"auto_poly3.fit(X3_train, y_train)\n",
"\n",
"# Prediction and MSE\n",
"y3_pred = auto_poly3.predict(X3_test)\n",
"print(\"Polynomial regression of degree 3: MSE = \", mean_squared_error(y_test, y3_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### Lab 5.3.2 Leave-One-Out Cross-Validation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Polynomial regression of degree 1:\n",
" MSE = 24.231513517929226\n",
"\n",
"\n",
"Polynomial regression of degree 2:\n",
" MSE = 19.24821312448939\n",
"\n",
"\n",
"Polynomial regression of degree 3:\n",
" MSE = 19.334984064114092\n",
"\n",
"\n",
"Polynomial regression of degree 4:\n",
" MSE = 19.42443030854574\n",
"\n",
"\n",
"Polynomial regression of degree 5:\n",
" MSE = 19.033219754727583\n",
"\n"
]
}
],
"source": [
"# Polynomial regression over degrees from 1 (simple linear) to 5\n",
"auto_poly = LinearRegression()\n",
"loocv = LeaveOneOut()\n",
"\n",
"for poly_deg in range(1, 6):\n",
" print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n",
" poly = PolynomialFeatures(degree=poly_deg)\n",
" X_d = poly.fit_transform(X)\n",
" scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')\n",
" loocv_mse = scores.mean() * (-1) # sign-flip to convert score to MSE\n",
" print(' MSE = {}\\n'.format(loocv_mse))"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### Lab 5.3.3 k-Fold Cross-Validation"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Polynomial regression of degree 1:\n",
" MSE = 27.439933652339857\n",
"\n",
"\n",
"Polynomial regression of degree 2:\n",
" MSE = 21.235840055802118\n",
"\n",
"\n",
"Polynomial regression of degree 3:\n",
" MSE = 21.3366061833284\n",
"\n",
"\n",
"Polynomial regression of degree 4:\n",
" MSE = 21.353886987563506\n",
"\n",
"\n",
"Polynomial regression of degree 5:\n",
" MSE = 20.905633737044845\n",
"\n",
"\n",
"Polynomial regression of degree 6:\n",
" MSE = 20.782704427497574\n",
"\n",
"\n",
"Polynomial regression of degree 7:\n",
" MSE = 20.953103378424892\n",
"\n",
"\n",
"Polynomial regression of degree 8:\n",
" MSE = 21.07713162886134\n",
"\n",
"\n",
"Polynomial regression of degree 9:\n",
" MSE = 21.036781313639857\n",
"\n",
"\n",
"Polynomial regression of degree 10:\n",
" MSE = 20.98095645636944\n",
"\n"
]
}
],
"source": [
"# Polynomial regression over degrees from 1 (simple linear) to 10\n",
"auto_poly = LinearRegression()\n",
"kfold = KFold(n_splits=10, random_state=47)\n",
"\n",
"for poly_deg in range(1, 11):\n",
" print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n",
" poly = PolynomialFeatures(degree=poly_deg)\n",
" X_d = poly.fit_transform(X)\n",
" scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')\n",
" loocv_mse = scores.mean() * (-1) # sign-flip to convert score to MSE\n",
" print(' MSE = {}\\n'.format(loocv_mse))"
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"source": [
"### Lab 5.3.4 The Bootstrap"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" X | \n",
" Y | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" -0.895251 | \n",
" -0.234924 | \n",
"
\n",
" \n",
" 2 | \n",
" -1.562454 | \n",
" -0.885176 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.417090 | \n",
" 0.271888 | \n",
"
\n",
" \n",
" 4 | \n",
" 1.044356 | \n",
" -0.734198 | \n",
"
\n",
" \n",
" 5 | \n",
" -0.315568 | \n",
" 0.841983 | \n",
"
\n",
" \n",
" 6 | \n",
" -1.737124 | \n",
" -2.037191 | \n",
"
\n",
" \n",
" 7 | \n",
" 1.966413 | \n",
" 1.452957 | \n",
"
\n",
" \n",
" 8 | \n",
" 2.152868 | \n",
" -0.434139 | \n",
"
\n",
" \n",
" 9 | \n",
" -0.081208 | \n",
" 1.450809 | \n",
"
\n",
" \n",
" 10 | \n",
" -0.891782 | \n",
" 0.821016 | \n",
"
\n",
" \n",
" 11 | \n",
" -0.293202 | \n",
" -1.042391 | \n",
"
\n",
" \n",
" 12 | \n",
" 0.505779 | \n",
" 0.608478 | \n",
"
\n",
" \n",
" 13 | \n",
" 0.526751 | \n",
" -0.222493 | \n",
"
\n",
" \n",
" 14 | \n",
" 1.066469 | \n",
" 1.231357 | \n",
"
\n",
" \n",
" 15 | \n",
" 0.294016 | \n",
" 0.628589 | \n",
"
\n",
" \n",
" 16 | \n",
" 0.042549 | \n",
" -1.267574 | \n",
"
\n",
" \n",
" 17 | \n",
" 1.830970 | \n",
" -0.572752 | \n",
"
\n",
" \n",
" 18 | \n",
" -0.326937 | \n",
" -0.487472 | \n",
"
\n",
" \n",
" 19 | \n",
" 0.521480 | \n",
" 2.565985 | \n",
"
\n",
" \n",
" 20 | \n",
" 1.399868 | \n",
" -0.357836 | \n",
"
\n",
" \n",
" 21 | \n",
" -0.645448 | \n",
" -1.412431 | \n",
"
\n",
" \n",
" 22 | \n",
" -0.904352 | \n",
" -0.568305 | \n",
"
\n",
" \n",
" 23 | \n",
" -1.764586 | \n",
" -0.746273 | \n",
"
\n",
" \n",
" 24 | \n",
" -1.810485 | \n",
" 0.493747 | \n",
"
\n",
" \n",
" 25 | \n",
" -1.169899 | \n",
" -2.725281 | \n",
"
\n",
" \n",
" 26 | \n",
" -0.685376 | \n",
" -0.457616 | \n",
"
\n",
" \n",
" 27 | \n",
" 1.090918 | \n",
" 0.014495 | \n",
"
\n",
" \n",
" 28 | \n",
" -0.432340 | \n",
" -0.399831 | \n",
"
\n",
" \n",
" 29 | \n",
" 0.268815 | \n",
" -0.201608 | \n",
"
\n",
" \n",
" 30 | \n",
" -0.851841 | \n",
" -1.741829 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 71 | \n",
" -0.984357 | \n",
" -1.139160 | \n",
"
\n",
" \n",
" 72 | \n",
" -1.384992 | \n",
" 0.702700 | \n",
"
\n",
" \n",
" 73 | \n",
" -0.358843 | \n",
" -1.694513 | \n",
"
\n",
" \n",
" 74 | \n",
" -0.226618 | \n",
" 0.801939 | \n",
"
\n",
" \n",
" 75 | \n",
" -0.941077 | \n",
" -0.733189 | \n",
"
\n",
" \n",
" 76 | \n",
" 2.460336 | \n",
" -0.048373 | \n",
"
\n",
" \n",
" 77 | \n",
" 0.716797 | \n",
" 0.602337 | \n",
"
\n",
" \n",
" 78 | \n",
" -0.248087 | \n",
" -1.018490 | \n",
"
\n",
" \n",
" 79 | \n",
" 1.010773 | \n",
" 0.052978 | \n",
"
\n",
" \n",
" 80 | \n",
" 2.313049 | \n",
" 1.752359 | \n",
"
\n",
" \n",
" 81 | \n",
" 0.835180 | \n",
" 0.985715 | \n",
"
\n",
" \n",
" 82 | \n",
" -1.071903 | \n",
" -1.247298 | \n",
"
\n",
" \n",
" 83 | \n",
" -1.650526 | \n",
" 0.215465 | \n",
"
\n",
" \n",
" 84 | \n",
" -0.600486 | \n",
" -0.420941 | \n",
"
\n",
" \n",
" 85 | \n",
" -0.058529 | \n",
" 0.127621 | \n",
"
\n",
" \n",
" 86 | \n",
" 0.075727 | \n",
" -0.522149 | \n",
"
\n",
" \n",
" 87 | \n",
" -1.157832 | \n",
" 0.590894 | \n",
"
\n",
" \n",
" 88 | \n",
" 1.673606 | \n",
" 0.114623 | \n",
"
\n",
" \n",
" 89 | \n",
" -1.043988 | \n",
" -0.418944 | \n",
"
\n",
" \n",
" 90 | \n",
" 0.014687 | \n",
" -0.558747 | \n",
"
\n",
" \n",
" 91 | \n",
" 0.675322 | \n",
" 1.482630 | \n",
"
\n",
" \n",
" 92 | \n",
" 1.778342 | \n",
" 0.942774 | \n",
"
\n",
" \n",
" 93 | \n",
" -1.295764 | \n",
" -1.085204 | \n",
"
\n",
" \n",
" 94 | \n",
" 0.079602 | \n",
" -0.539101 | \n",
"
\n",
" \n",
" 95 | \n",
" 2.260858 | \n",
" 0.673225 | \n",
"
\n",
" \n",
" 96 | \n",
" 0.479091 | \n",
" 1.454774 | \n",
"
\n",
" \n",
" 97 | \n",
" -0.535020 | \n",
" -0.399175 | \n",
"
\n",
" \n",
" 98 | \n",
" -0.773129 | \n",
" -0.957175 | \n",
"
\n",
" \n",
" 99 | \n",
" 0.403634 | \n",
" 1.396038 | \n",
"
\n",
" \n",
" 100 | \n",
" -0.588496 | \n",
" -0.497285 | \n",
"
\n",
" \n",
"
\n",
"
100 rows × 2 columns
\n",
"
"
],
"text/plain": [
" X Y\n",
"1 -0.895251 -0.234924\n",
"2 -1.562454 -0.885176\n",
"3 -0.417090 0.271888\n",
"4 1.044356 -0.734198\n",
"5 -0.315568 0.841983\n",
"6 -1.737124 -2.037191\n",
"7 1.966413 1.452957\n",
"8 2.152868 -0.434139\n",
"9 -0.081208 1.450809\n",
"10 -0.891782 0.821016\n",
"11 -0.293202 -1.042391\n",
"12 0.505779 0.608478\n",
"13 0.526751 -0.222493\n",
"14 1.066469 1.231357\n",
"15 0.294016 0.628589\n",
"16 0.042549 -1.267574\n",
"17 1.830970 -0.572752\n",
"18 -0.326937 -0.487472\n",
"19 0.521480 2.565985\n",
"20 1.399868 -0.357836\n",
"21 -0.645448 -1.412431\n",
"22 -0.904352 -0.568305\n",
"23 -1.764586 -0.746273\n",
"24 -1.810485 0.493747\n",
"25 -1.169899 -2.725281\n",
"26 -0.685376 -0.457616\n",
"27 1.090918 0.014495\n",
"28 -0.432340 -0.399831\n",
"29 0.268815 -0.201608\n",
"30 -0.851841 -1.741829\n",
".. ... ...\n",
"71 -0.984357 -1.139160\n",
"72 -1.384992 0.702700\n",
"73 -0.358843 -1.694513\n",
"74 -0.226618 0.801939\n",
"75 -0.941077 -0.733189\n",
"76 2.460336 -0.048373\n",
"77 0.716797 0.602337\n",
"78 -0.248087 -1.018490\n",
"79 1.010773 0.052978\n",
"80 2.313049 1.752359\n",
"81 0.835180 0.985715\n",
"82 -1.071903 -1.247298\n",
"83 -1.650526 0.215465\n",
"84 -0.600486 -0.420941\n",
"85 -0.058529 0.127621\n",
"86 0.075727 -0.522149\n",
"87 -1.157832 0.590894\n",
"88 1.673606 0.114623\n",
"89 -1.043988 -0.418944\n",
"90 0.014687 -0.558747\n",
"91 0.675322 1.482630\n",
"92 1.778342 0.942774\n",
"93 -1.295764 -1.085204\n",
"94 0.079602 -0.539101\n",
"95 2.260858 0.673225\n",
"96 0.479091 1.454774\n",
"97 -0.535020 -0.399175\n",
"98 -0.773129 -0.957175\n",
"99 0.403634 1.396038\n",
"100 -0.588496 -0.497285\n",
"\n",
"[100 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Portfolio dataset is in R ISLR package\n",
"islr = importr('ISLR')\n",
"portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']\n",
"portfolio = pandas2ri.ri2py(portfolio_rdf)\n",
"display(portfolio)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Portfolio alpha = 0.575832074593\n"
]
}
],
"source": [
"# Function to calculate the alpha for portfolio allocation\n",
"def alpha(data):\n",
" \"\"\"\n",
" data: pandas dataframe with two columns X and Y.\n",
" \"\"\"\n",
"\n",
" sigma = data.cov() # covariance matrix\n",
" var_x = sigma.X['X']\n",
" var_y = sigma.Y['Y']\n",
" cov_xy = sigma.X['Y']\n",
" alpha = (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)\n",
" return alpha\n",
"alpha_original = alpha(portfolio)\n",
"print(\"Portfolio alpha = \", alpha_original)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" original | \n",
" bias | \n",
" std. error | \n",
"
\n",
" \n",
" \n",
" \n",
" alpha | \n",
" 0.575832 | \n",
" 0.001963 | \n",
" 0.089929 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" original bias std. error\n",
"alpha 0.575832 0.001963 0.089929"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bootstrap with B=1000 on portfolio data\n",
"N = portfolio.shape[0]\n",
"B = 1000\n",
"portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)\n",
"alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]\n",
"alpha_bias = np.mean(alphas) - alpha_original\n",
"alpha_se = np.std(alphas)\n",
"alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se],],\n",
" columns=['original', 'bias', 'std. error'], index=['alpha'])\n",
"display(alpha_bootstrap)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"mpg ~ horsepower coefficients:\n",
"\n",
" Intercept horsepower\n",
" 39.935861 -0.157845\n"
]
}
],
"source": [
"# Function to get simple linear regression coefficients for Auto data set\n",
"def auto_coef(data, features, response):\n",
" \"\"\"\n",
" data: pandas dataframe sampled from the Auto data set\n",
" features: a string list of feature names\n",
" response: a string of response names\n",
" \"\"\"\n",
"\n",
" auto_reg = LinearRegression()\n",
" auto_reg.fit(data[features], data[response])\n",
" return [auto_reg.intercept_] + list(auto_reg.coef_)\n",
"\n",
"features = ['horsepower']\n",
"response = 'mpg'\n",
"coef_original = pd.DataFrame([auto_coef(auto, features, response)], columns=['Intercept'] + features, index=[''])\n",
"print(\"\\nmpg ~ horsepower coefficients:\\n\\n\", coef_original)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"autoscroll": "json-false",
"collapsed": false,
"ein.tags": [
"worksheet-0"
],
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" original | \n",
" bias | \n",
" std. error | \n",
"
\n",
" \n",
" \n",
" \n",
" Intercept | \n",
" 39.935861 | \n",
" 0.033521 | \n",
" 0.869087 | \n",
"
\n",
" \n",
" horsepower | \n",
" -0.157845 | \n",
" -0.000500 | \n",
" 0.007503 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" original bias std. error\n",
"Intercept 39.935861 0.033521 0.869087\n",
"horsepower -0.157845 -0.000500 0.007503"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bootstrap with B=1000 on Auto data\n",
"N = auto.shape[0]\n",
"B = 1000\n",
"auto_b = resample(auto, n_samples=N*B, random_state=42)\n",
"coefs = [auto_coef(group, features, response) for name, group in auto_b.groupby(np.arange(N * B) // N)]\n",
"coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)\n",
"coef_bias = coefs_df.mean() - coef_original\n",
"coef_se = coefs_df.std()\n",
"coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)\n",
"coef_bootstrap.columns = ['original', 'bias', 'std. error']\n",
"display(coef_bootstrap)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
},
"name": "Chapter 5 Resampling Methods.ipynb"
},
"nbformat": 4,
"nbformat_minor": 2
}