{ "cells": [ { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "# Chapter 5 - Resampling Methods" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "- [Lab 5.3.1 The Validation Set Approach](#lab-5.3.1)\n", "- [Lab 5.3.2 Leave-One-Out Cross-Validation](#lab-5.3.2)\n", "- [Lab 5.3.3 k-Fold Cross-Validation](#lab-5.3.3)\n", "- [Lab 5.3.4 The Bootstrap](#lab-5.3.4)" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### Imports and Configurations" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "# Standard imports\n", "import warnings\n", "\n", "# Use rpy2 for loading R datasets\n", "from rpy2.robjects.packages import importr\n", "from rpy2.robjects.packages import data as rdata\n", "from rpy2.robjects import pandas2ri\n", "\n", "# Math and data processing\n", "import numpy as np\n", "import scipy as sp\n", "import pandas as pd\n", "\n", "# scikit-learn\n", "from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score\n", "from sklearn.utils import resample\n", "from sklearn.preprocessing import scale, PolynomialFeatures\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import mean_squared_error\n", "\n", "# Visualization\n", "from IPython.display import display\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "mpl.style.use('ggplot')" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": {
"slide_type": "-" } }, "source": [ "### Lab 5.3.1 The Validation Set Approach" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mpgcylindersdisplacementhorsepowerweightaccelerationyearoriginname
118.08.0307.0130.03504.012.070.01.0chevrolet chevelle malibu
215.08.0350.0165.03693.011.570.01.0buick skylark 320
318.08.0318.0150.03436.011.070.01.0plymouth satellite
416.08.0304.0150.03433.012.070.01.0amc rebel sst
517.08.0302.0140.03449.010.570.01.0ford torino
615.08.0429.0198.04341.010.070.01.0ford galaxie 500
714.08.0454.0220.04354.09.070.01.0chevrolet impala
814.08.0440.0215.04312.08.570.01.0plymouth fury iii
914.08.0455.0225.04425.010.070.01.0pontiac catalina
1015.08.0390.0190.03850.08.570.01.0amc ambassador dpl
1115.08.0383.0170.03563.010.070.01.0dodge challenger se
1214.08.0340.0160.03609.08.070.01.0plymouth 'cuda 340
1315.08.0400.0150.03761.09.570.01.0chevrolet monte carlo
1414.08.0455.0225.03086.010.070.01.0buick estate wagon (sw)
1524.04.0113.095.02372.015.070.03.0toyota corona mark ii
1622.06.0198.095.02833.015.570.01.0plymouth duster
1718.06.0199.097.02774.015.570.01.0amc hornet
1821.06.0200.085.02587.016.070.01.0ford maverick
1927.04.097.088.02130.014.570.03.0datsun pl510
2026.04.097.046.01835.020.570.02.0volkswagen 1131 deluxe sedan
2125.04.0110.087.02672.017.570.02.0peugeot 504
2224.04.0107.090.02430.014.570.02.0audi 100 ls
2325.04.0104.095.02375.017.570.02.0saab 99e
2426.04.0121.0113.02234.012.570.02.0bmw 2002
2521.06.0199.090.02648.015.070.01.0amc gremlin
2610.08.0360.0215.04615.014.070.01.0ford f250
2710.08.0307.0200.04376.015.070.01.0chevy c20
2811.08.0318.0210.04382.013.570.01.0dodge d200
299.08.0304.0193.04732.018.570.01.0hi 1200d
3027.04.097.088.02130.014.571.03.0datsun pl510
..............................
36828.04.0112.088.02605.019.682.01.0chevrolet cavalier
36927.04.0112.088.02640.018.682.01.0chevrolet cavalier wagon
37034.04.0112.088.02395.018.082.01.0chevrolet cavalier 2-door
37131.04.0112.085.02575.016.282.01.0pontiac j2000 se hatchback
37229.04.0135.084.02525.016.082.01.0dodge aries se
37327.04.0151.090.02735.018.082.01.0pontiac phoenix
37424.04.0140.092.02865.016.482.01.0ford fairmont futura
37536.04.0105.074.01980.015.382.02.0volkswagen rabbit l
37637.04.091.068.02025.018.282.03.0mazda glc custom l
37731.04.091.068.01970.017.682.03.0mazda glc custom
37838.04.0105.063.02125.014.782.01.0plymouth horizon miser
37936.04.098.070.02125.017.382.01.0mercury lynx l
38036.04.0120.088.02160.014.582.03.0nissan stanza xe
38136.04.0107.075.02205.014.582.03.0honda accord
38234.04.0108.070.02245.016.982.03.0toyota corolla
38338.04.091.067.01965.015.082.03.0honda civic
38432.04.091.067.01965.015.782.03.0honda civic (auto)
38538.04.091.067.01995.016.282.03.0datsun 310 gx
38625.06.0181.0110.02945.016.482.01.0buick century limited
38738.06.0262.085.03015.017.082.01.0oldsmobile cutlass ciera (diesel)
38826.04.0156.092.02585.014.582.01.0chrysler lebaron medallion
38922.06.0232.0112.02835.014.782.01.0ford granada l
39032.04.0144.096.02665.013.982.03.0toyota celica gt
39136.04.0135.084.02370.013.082.01.0dodge charger 2.2
39227.04.0151.090.02950.017.382.01.0chevrolet camaro
39327.04.0140.086.02790.015.682.01.0ford mustang gl
39444.04.097.052.02130.024.682.02.0vw pickup
39532.04.0135.084.02295.011.682.01.0dodge rampage
39628.04.0120.079.02625.018.682.01.0ford ranger
39731.04.0119.082.02720.019.482.01.0chevy s-10
\n", "

392 rows × 9 columns

\n", "
" ], "text/plain": [ " mpg cylinders displacement horsepower weight acceleration year \\\n", "1 18.0 8.0 307.0 130.0 3504.0 12.0 70.0 \n", "2 15.0 8.0 350.0 165.0 3693.0 11.5 70.0 \n", "3 18.0 8.0 318.0 150.0 3436.0 11.0 70.0 \n", "4 16.0 8.0 304.0 150.0 3433.0 12.0 70.0 \n", "5 17.0 8.0 302.0 140.0 3449.0 10.5 70.0 \n", "6 15.0 8.0 429.0 198.0 4341.0 10.0 70.0 \n", "7 14.0 8.0 454.0 220.0 4354.0 9.0 70.0 \n", "8 14.0 8.0 440.0 215.0 4312.0 8.5 70.0 \n", "9 14.0 8.0 455.0 225.0 4425.0 10.0 70.0 \n", "10 15.0 8.0 390.0 190.0 3850.0 8.5 70.0 \n", "11 15.0 8.0 383.0 170.0 3563.0 10.0 70.0 \n", "12 14.0 8.0 340.0 160.0 3609.0 8.0 70.0 \n", "13 15.0 8.0 400.0 150.0 3761.0 9.5 70.0 \n", "14 14.0 8.0 455.0 225.0 3086.0 10.0 70.0 \n", "15 24.0 4.0 113.0 95.0 2372.0 15.0 70.0 \n", "16 22.0 6.0 198.0 95.0 2833.0 15.5 70.0 \n", "17 18.0 6.0 199.0 97.0 2774.0 15.5 70.0 \n", "18 21.0 6.0 200.0 85.0 2587.0 16.0 70.0 \n", "19 27.0 4.0 97.0 88.0 2130.0 14.5 70.0 \n", "20 26.0 4.0 97.0 46.0 1835.0 20.5 70.0 \n", "21 25.0 4.0 110.0 87.0 2672.0 17.5 70.0 \n", "22 24.0 4.0 107.0 90.0 2430.0 14.5 70.0 \n", "23 25.0 4.0 104.0 95.0 2375.0 17.5 70.0 \n", "24 26.0 4.0 121.0 113.0 2234.0 12.5 70.0 \n", "25 21.0 6.0 199.0 90.0 2648.0 15.0 70.0 \n", "26 10.0 8.0 360.0 215.0 4615.0 14.0 70.0 \n", "27 10.0 8.0 307.0 200.0 4376.0 15.0 70.0 \n", "28 11.0 8.0 318.0 210.0 4382.0 13.5 70.0 \n", "29 9.0 8.0 304.0 193.0 4732.0 18.5 70.0 \n", "30 27.0 4.0 97.0 88.0 2130.0 14.5 71.0 \n", ".. ... ... ... ... ... ... ... 
\n", "368 28.0 4.0 112.0 88.0 2605.0 19.6 82.0 \n", "369 27.0 4.0 112.0 88.0 2640.0 18.6 82.0 \n", "370 34.0 4.0 112.0 88.0 2395.0 18.0 82.0 \n", "371 31.0 4.0 112.0 85.0 2575.0 16.2 82.0 \n", "372 29.0 4.0 135.0 84.0 2525.0 16.0 82.0 \n", "373 27.0 4.0 151.0 90.0 2735.0 18.0 82.0 \n", "374 24.0 4.0 140.0 92.0 2865.0 16.4 82.0 \n", "375 36.0 4.0 105.0 74.0 1980.0 15.3 82.0 \n", "376 37.0 4.0 91.0 68.0 2025.0 18.2 82.0 \n", "377 31.0 4.0 91.0 68.0 1970.0 17.6 82.0 \n", "378 38.0 4.0 105.0 63.0 2125.0 14.7 82.0 \n", "379 36.0 4.0 98.0 70.0 2125.0 17.3 82.0 \n", "380 36.0 4.0 120.0 88.0 2160.0 14.5 82.0 \n", "381 36.0 4.0 107.0 75.0 2205.0 14.5 82.0 \n", "382 34.0 4.0 108.0 70.0 2245.0 16.9 82.0 \n", "383 38.0 4.0 91.0 67.0 1965.0 15.0 82.0 \n", "384 32.0 4.0 91.0 67.0 1965.0 15.7 82.0 \n", "385 38.0 4.0 91.0 67.0 1995.0 16.2 82.0 \n", "386 25.0 6.0 181.0 110.0 2945.0 16.4 82.0 \n", "387 38.0 6.0 262.0 85.0 3015.0 17.0 82.0 \n", "388 26.0 4.0 156.0 92.0 2585.0 14.5 82.0 \n", "389 22.0 6.0 232.0 112.0 2835.0 14.7 82.0 \n", "390 32.0 4.0 144.0 96.0 2665.0 13.9 82.0 \n", "391 36.0 4.0 135.0 84.0 2370.0 13.0 82.0 \n", "392 27.0 4.0 151.0 90.0 2950.0 17.3 82.0 \n", "393 27.0 4.0 140.0 86.0 2790.0 15.6 82.0 \n", "394 44.0 4.0 97.0 52.0 2130.0 24.6 82.0 \n", "395 32.0 4.0 135.0 84.0 2295.0 11.6 82.0 \n", "396 28.0 4.0 120.0 79.0 2625.0 18.6 82.0 \n", "397 31.0 4.0 119.0 82.0 2720.0 19.4 82.0 \n", "\n", " origin name \n", "1 1.0 chevrolet chevelle malibu \n", "2 1.0 buick skylark 320 \n", "3 1.0 plymouth satellite \n", "4 1.0 amc rebel sst \n", "5 1.0 ford torino \n", "6 1.0 ford galaxie 500 \n", "7 1.0 chevrolet impala \n", "8 1.0 plymouth fury iii \n", "9 1.0 pontiac catalina \n", "10 1.0 amc ambassador dpl \n", "11 1.0 dodge challenger se \n", "12 1.0 plymouth 'cuda 340 \n", "13 1.0 chevrolet monte carlo \n", "14 1.0 buick estate wagon (sw) \n", "15 3.0 toyota corona mark ii \n", "16 1.0 plymouth duster \n", "17 1.0 amc hornet \n", "18 1.0 ford maverick \n", "19 3.0 datsun 
pl510 \n", "20 2.0 volkswagen 1131 deluxe sedan \n", "21 2.0 peugeot 504 \n", "22 2.0 audi 100 ls \n", "23 2.0 saab 99e \n", "24 2.0 bmw 2002 \n", "25 1.0 amc gremlin \n", "26 1.0 ford f250 \n", "27 1.0 chevy c20 \n", "28 1.0 dodge d200 \n", "29 1.0 hi 1200d \n", "30 3.0 datsun pl510 \n", ".. ... ... \n", "368 1.0 chevrolet cavalier \n", "369 1.0 chevrolet cavalier wagon \n", "370 1.0 chevrolet cavalier 2-door \n", "371 1.0 pontiac j2000 se hatchback \n", "372 1.0 dodge aries se \n", "373 1.0 pontiac phoenix \n", "374 1.0 ford fairmont futura \n", "375 2.0 volkswagen rabbit l \n", "376 3.0 mazda glc custom l \n", "377 3.0 mazda glc custom \n", "378 1.0 plymouth horizon miser \n", "379 1.0 mercury lynx l \n", "380 3.0 nissan stanza xe \n", "381 3.0 honda accord \n", "382 3.0 toyota corolla \n", "383 3.0 honda civic \n", "384 3.0 honda civic (auto) \n", "385 3.0 datsun 310 gx \n", "386 1.0 buick century limited \n", "387 1.0 oldsmobile cutlass ciera (diesel) \n", "388 1.0 chrysler lebaron medallion \n", "389 1.0 ford granada l \n", "390 3.0 toyota celica gt \n", "391 1.0 dodge charger 2.2 \n", "392 1.0 chevrolet camaro \n", "393 1.0 ford mustang gl \n", "394 2.0 vw pickup \n", "395 1.0 dodge rampage \n", "396 1.0 ford ranger \n", "397 1.0 chevy s-10 \n", "\n", "[392 rows x 9 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Auto dataset is in R ISLR package\n", "islr = importr('ISLR')\n", "auto_rdf = rdata(islr).fetch('Auto')['Auto']\n", "auto = pandas2ri.ri2py(auto_rdf)\n", "display(auto)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SLR MSE = 25.273723993\n" ] } ], "source": [ "# Simple linear regression features and response\n", "features = ['horsepower']\n", "response = ['mpg']\n", "X = auto[features]\n", "y = auto[response]\n",
"\n", "# Hold out 196 observations for validation; the fixed seed keeps the split repeatable\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=196, random_state=47)\n", "\n", "# Fit mpg ~ horsepower on the training half\n", "auto_slr = LinearRegression().fit(X_train, y_train)\n", "\n", "# Validation-set MSE\n", "y_pred = auto_slr.predict(X_test)\n", "slr_mse = mean_squared_error(y_test, y_pred)\n", "print(\"SLR MSE = \", slr_mse)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Polynomial regression of degree 2: MSE = 18.8690031195\n" ] } ], "source": [ "# Degree-2 polynomial expansion of horsepower\n", "poly2 = PolynomialFeatures(degree=2)\n", "X2 = poly2.fit_transform(X)\n", "\n", "# Same seed and test size as the linear fit, so the same rows are held out\n", "X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=196, random_state=47)\n", "\n", "# Fit on the training half\n", "auto_poly2 = LinearRegression().fit(X2_train, y_train)\n", "\n", "# Validation-set MSE\n", "y2_pred = auto_poly2.predict(X2_test)\n", "poly2_mse = mean_squared_error(y_test, y2_pred)\n", "print(\"Polynomial regression of degree 2: MSE = \", poly2_mse)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Polynomial regression of degree 3: MSE = 18.8333669959\n" ] } ], "source": [ "# Degree-3 polynomial expansion of horsepower\n", "poly3 = PolynomialFeatures(degree=3)\n", "X3 = poly3.fit_transform(X)\n", "\n", "# Same seed and test size again\n", "X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=196, random_state=47)\n", "\n", "# Fit on the training half\n", "auto_poly3 = LinearRegression().fit(X3_train, y_train)\n", "\n", "# Validation-set MSE\n", "y3_pred = 
auto_poly3.predict(X3_test)\n", "print(\"Polynomial regression of degree 3: MSE = \", mean_squared_error(y_test, y3_pred))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.2 Leave-One-Out Cross-Validation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Polynomial regression of degree 1:\n", " MSE = 24.231513517929226\n", "\n", "\n", "Polynomial regression of degree 2:\n", " MSE = 19.24821312448939\n", "\n", "\n", "Polynomial regression of degree 3:\n", " MSE = 19.334984064114092\n", "\n", "\n", "Polynomial regression of degree 4:\n", " MSE = 19.42443030854574\n", "\n", "\n", "Polynomial regression of degree 5:\n", " MSE = 19.033219754727583\n", "\n" ] } ], "source": [ "# Leave-one-out CV error of polynomial fits, degree 1 (simple linear) through 5\n", "auto_poly = LinearRegression()\n", "loocv = LeaveOneOut()\n", "\n", "for poly_deg in range(1, 6):\n", "    print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n", "    poly = PolynomialFeatures(degree=poly_deg)\n", "    X_d = poly.fit_transform(X)\n", "    scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')\n", "    # the scorer returns negated MSE, so negate the mean to get the MSE back\n", "    loocv_mse = -scores.mean()\n", "    print(' MSE = {}\\n'.format(loocv_mse))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.3 k-Fold Cross-Validation" ] }, { "cell_type": "code", "execution_count": 7, "metadata":
{ "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Polynomial regression of degree 1:\n", " MSE = 27.439933652339857\n", "\n", "\n", "Polynomial regression of degree 2:\n", " MSE = 21.235840055802118\n", "\n", "\n", "Polynomial regression of degree 3:\n", " MSE = 21.3366061833284\n", "\n", "\n", "Polynomial regression of degree 4:\n", " MSE = 21.353886987563506\n", "\n", "\n", "Polynomial regression of degree 5:\n", " MSE = 20.905633737044845\n", "\n", "\n", "Polynomial regression of degree 6:\n", " MSE = 20.782704427497574\n", "\n", "\n", "Polynomial regression of degree 7:\n", " MSE = 20.953103378424892\n", "\n", "\n", "Polynomial regression of degree 8:\n", " MSE = 21.07713162886134\n", "\n", "\n", "Polynomial regression of degree 9:\n", " MSE = 21.036781313639857\n", "\n", "\n", "Polynomial regression of degree 10:\n", " MSE = 20.98095645636944\n", "\n" ] } ], "source": [ "# 10-fold CV error of polynomial fits, degree 1 (simple linear) to 10\n", "auto_poly = LinearRegression()\n", "# random_state only has an effect together with shuffle=True: on its own it is\n", "# ignored by older scikit-learn and rejected with a ValueError by newer releases.\n", "# Leaving it out keeps the contiguous, unshuffled folds that produced the MSE values below.\n", "# NOTE(review): KFold(n_splits=10, shuffle=True, random_state=47) would randomise the\n", "# folds, but the printed numbers would change -- re-run the cell if you switch.\n", "kfold = KFold(n_splits=10)\n", "\n", "for poly_deg in range(1, 11):\n", "    print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n", "    poly = PolynomialFeatures(degree=poly_deg)\n", "    X_d = poly.fit_transform(X)\n", "    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')\n", "    kfold_mse = scores.mean() * (-1)  # sign-flip to convert score to MSE\n", "    print(' MSE = {}\\n'.format(kfold_mse))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.4 The Bootstrap" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "autoscroll": "json-false", "collapsed": false,
"ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XY
1-0.895251-0.234924
2-1.562454-0.885176
3-0.4170900.271888
41.044356-0.734198
5-0.3155680.841983
6-1.737124-2.037191
71.9664131.452957
82.152868-0.434139
9-0.0812081.450809
10-0.8917820.821016
11-0.293202-1.042391
120.5057790.608478
130.526751-0.222493
141.0664691.231357
150.2940160.628589
160.042549-1.267574
171.830970-0.572752
18-0.326937-0.487472
190.5214802.565985
201.399868-0.357836
21-0.645448-1.412431
22-0.904352-0.568305
23-1.764586-0.746273
24-1.8104850.493747
25-1.169899-2.725281
26-0.685376-0.457616
271.0909180.014495
28-0.432340-0.399831
290.268815-0.201608
30-0.851841-1.741829
.........
71-0.984357-1.139160
72-1.3849920.702700
73-0.358843-1.694513
74-0.2266180.801939
75-0.941077-0.733189
762.460336-0.048373
770.7167970.602337
78-0.248087-1.018490
791.0107730.052978
802.3130491.752359
810.8351800.985715
82-1.071903-1.247298
83-1.6505260.215465
84-0.600486-0.420941
85-0.0585290.127621
860.075727-0.522149
87-1.1578320.590894
881.6736060.114623
89-1.043988-0.418944
900.014687-0.558747
910.6753221.482630
921.7783420.942774
93-1.295764-1.085204
940.079602-0.539101
952.2608580.673225
960.4790911.454774
97-0.535020-0.399175
98-0.773129-0.957175
990.4036341.396038
100-0.588496-0.497285
\n", "

100 rows × 2 columns

\n", "
" ], "text/plain": [ " X Y\n", "1 -0.895251 -0.234924\n", "2 -1.562454 -0.885176\n", "3 -0.417090 0.271888\n", "4 1.044356 -0.734198\n", "5 -0.315568 0.841983\n", "6 -1.737124 -2.037191\n", "7 1.966413 1.452957\n", "8 2.152868 -0.434139\n", "9 -0.081208 1.450809\n", "10 -0.891782 0.821016\n", "11 -0.293202 -1.042391\n", "12 0.505779 0.608478\n", "13 0.526751 -0.222493\n", "14 1.066469 1.231357\n", "15 0.294016 0.628589\n", "16 0.042549 -1.267574\n", "17 1.830970 -0.572752\n", "18 -0.326937 -0.487472\n", "19 0.521480 2.565985\n", "20 1.399868 -0.357836\n", "21 -0.645448 -1.412431\n", "22 -0.904352 -0.568305\n", "23 -1.764586 -0.746273\n", "24 -1.810485 0.493747\n", "25 -1.169899 -2.725281\n", "26 -0.685376 -0.457616\n", "27 1.090918 0.014495\n", "28 -0.432340 -0.399831\n", "29 0.268815 -0.201608\n", "30 -0.851841 -1.741829\n", ".. ... ...\n", "71 -0.984357 -1.139160\n", "72 -1.384992 0.702700\n", "73 -0.358843 -1.694513\n", "74 -0.226618 0.801939\n", "75 -0.941077 -0.733189\n", "76 2.460336 -0.048373\n", "77 0.716797 0.602337\n", "78 -0.248087 -1.018490\n", "79 1.010773 0.052978\n", "80 2.313049 1.752359\n", "81 0.835180 0.985715\n", "82 -1.071903 -1.247298\n", "83 -1.650526 0.215465\n", "84 -0.600486 -0.420941\n", "85 -0.058529 0.127621\n", "86 0.075727 -0.522149\n", "87 -1.157832 0.590894\n", "88 1.673606 0.114623\n", "89 -1.043988 -0.418944\n", "90 0.014687 -0.558747\n", "91 0.675322 1.482630\n", "92 1.778342 0.942774\n", "93 -1.295764 -1.085204\n", "94 0.079602 -0.539101\n", "95 2.260858 0.673225\n", "96 0.479091 1.454774\n", "97 -0.535020 -0.399175\n", "98 -0.773129 -0.957175\n", "99 0.403634 1.396038\n", "100 -0.588496 -0.497285\n", "\n", "[100 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Portfolio dataset is in R ISLR package\n", "islr = importr('ISLR')\n", "portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']\n", "portfolio = pandas2ri.ri2py(portfolio_rdf)\n", "display(portfolio)" ] }, { "cell_type": "code",
"execution_count": 9, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Portfolio alpha = 0.575832074593\n" ] } ], "source": [ "# Function to calculate the alpha for portfolio allocation\n", "def alpha(data):\n", "    \"\"\"\n", "    Return (Var(Y) - Cov(X, Y)) / (Var(X) + Var(Y) - 2 * Cov(X, Y)),\n", "    with the variances and covariance taken from data.cov().\n", "\n", "    data: pandas dataframe with two columns X and Y.\n", "    \"\"\"\n", "\n", "    cov = data.cov()  # covariance matrix, labelled by column name on both axes\n", "    var_x = cov.loc['X', 'X']\n", "    var_y = cov.loc['Y', 'Y']\n", "    cov_xy = cov.loc['Y', 'X']\n", "    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)\n", "\n", "alpha_original = alpha(portfolio)\n", "print(\"Portfolio alpha = \", alpha_original)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originalbiasstd. error
alpha0.5758320.0019630.089929
\n", "
" ], "text/plain": [ " original bias std. error\n", "alpha 0.575832 0.001963 0.089929" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bootstrap estimate of the bias and standard error of alpha (B = 1000 resamples)\n", "N = portfolio.shape[0]\n", "B = 1000\n", "# Draw all N*B rows in one call, then cut them into B consecutive groups of N rows\n", "portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)\n", "replicate_id = np.arange(N * B) // N\n", "alphas = [alpha(group) for _replicate, group in portfolio_b.groupby(replicate_id)]\n", "alpha_bias = np.mean(alphas) - alpha_original\n", "# NOTE(review): np.std defaults to ddof=0, whereas the Auto bootstrap cell uses\n", "# DataFrame.std (ddof=1) -- confirm which convention is intended\n", "alpha_se = np.std(alphas)\n", "alpha_bootstrap = pd.DataFrame(\n", "    [[alpha_original, alpha_bias, alpha_se]],\n", "    columns=['original', 'bias', 'std. error'],\n", "    index=['alpha'])\n", "display(alpha_bootstrap)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "mpg ~ horsepower coefficients:\n", "\n", " Intercept horsepower\n", " 39.935861 -0.157845\n" ] } ], "source": [ "# Linear regression coefficients for (a resample of) the Auto data set\n", "def auto_coef(data, features, response):\n", "    \"\"\"\n", "    Fit LinearRegression of data[response] on data[features] and return\n", "    the list [intercept, coefficient of features[0], coefficient of features[1], ...].\n", "\n", "    data: pandas dataframe sampled from the Auto data set\n", "    features: a string list of feature names\n", "    response: a string naming the response column\n", "    \"\"\"\n", "\n", "    design = data[features]\n", "    target = data[response]\n", "    auto_reg = LinearRegression().fit(design, target)\n", "    return [auto_reg.intercept_, *auto_reg.coef_]\n", "\n", "features = ['horsepower']\n", "response = 'mpg'\n", "coef_original = pd.DataFrame([auto_coef(auto, features, response)], columns=['Intercept'] + features, index=[''])\n", "print(\"\\nmpg ~ horsepower coefficients:\\n\\n\", coef_original)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originalbiasstd. error
Intercept39.9358610.0335210.869087
horsepower-0.157845-0.0005000.007503
\n", "
" ], "text/plain": [ " original bias std. error\n", "Intercept 39.935861 0.033521 0.869087\n", "horsepower -0.157845 -0.000500 0.007503" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bootstrap (B = 1000 resamples) of the mpg ~ horsepower coefficients\n", "N = auto.shape[0]\n", "B = 1000\n", "# Draw all N*B rows in one call, then cut them into B consecutive groups of N rows\n", "auto_b = resample(auto, n_samples=N*B, random_state=42)\n", "replicate_id = np.arange(N * B) // N\n", "coefs = [auto_coef(group, features, response) for _replicate, group in auto_b.groupby(replicate_id)]\n", "coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)\n", "# Series minus one-row DataFrame: aligned on the coefficient names, giving a one-row DataFrame\n", "coef_bias = coefs_df.mean() - coef_original\n", "coef_se = coefs_df.std()\n", "# Transpose the one-row frames so that every piece is indexed by coefficient name\n", "coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)\n", "coef_bootstrap.columns = ['original', 'bias', 'std. error']\n", "display(coef_bootstrap)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" }, "name": "Chapter 5 Resampling Methods.ipynb" }, "nbformat": 4, "nbformat_minor": 2 }