{ "cells": [ { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "# Chapter 5 - Resampling Methods" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "- [Lab 5.3.1 The Validation Set Approach](#lab-5.3.1)\n", "- [Lab 5.3.2 Leave-One-Out Cross-Validation](#lab-5.3.2)\n", "- [Lab 5.3.3 k-Fold Cross-Validation](#lab-5.3.3)\n", "- [Lab 5.3.4 The Bootstrap](#lab-5.3.4)" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### Imports and Configurations" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "# Standard imports\n", "import warnings\n", "\n", "# Use rpy2 for loading R datasets\n", "from rpy2.robjects.packages import importr\n", "from rpy2.robjects.packages import data as rdata\n", "from rpy2.robjects import pandas2ri\n", "\n", "# Math and data processing\n", "import numpy as np\n", "import scipy as sp\n", "import pandas as pd\n", "\n", "# scikit-learn\n", "from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score\n", "from sklearn.utils import resample\n", "from sklearn.preprocessing import scale, PolynomialFeatures\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import mean_squared_error\n", "\n", "# Visualization\n", "from IPython.display import display\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "mpl.style.use('ggplot')" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { 
"slide_type": "-" } }, "source": [ "### Lab 5.3.1 The Validation Set Approach" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
118.08.0307.0130.03504. chevelle malibu
215.08.0350.0165.03693.011.570.01.0buick skylark 320
318.08.0318.0150.03436. satellite
416.08.0304.0150.03433. rebel sst
517.08.0302.0140.03449.010.570.01.0ford torino
615.08.0429.0198.04341. galaxie 500
714.08.0454.0220.04354. impala
814.08.0440.0215.04312.08.570.01.0plymouth fury iii
914.08.0455.0225.04425. catalina
1015.08.0390.0190.03850.08.570.01.0amc ambassador dpl
1115.08.0383.0170.03563. challenger se
1214.08.0340.0160.03609. 'cuda 340
1315.08.0400.0150.03761.09.570.01.0chevrolet monte carlo
1414.08.0455.0225.03086. estate wagon (sw)
1524.04.0113.095.02372. corona mark ii
1622.06.0198.095.02833.015.570.01.0plymouth duster
1718.06.0199.097.02774.015.570.01.0amc hornet
1821.06.0200.085.02587. maverick
1927. pl510
2026. 1131 deluxe sedan
2125.04.0110.087.02672.017.570.02.0peugeot 504
2224.04.0107.090.02430.014.570.02.0audi 100 ls
2325.04.0104.095.02375.017.570.02.0saab 99e
2426.04.0121.0113.02234.012.570.02.0bmw 2002
2521.06.0199.090.02648. gremlin
2610.08.0360.0215.04615. f250
2710.08.0307.0200.04376. c20
2811.08.0318.0210.04382.013.570.01.0dodge d200
299.08.0304.0193.04732.018.570.01.0hi 1200d
3027. pl510
36828.04.0112.088.02605.019.682.01.0chevrolet cavalier
36927.04.0112.088.02640.018.682.01.0chevrolet cavalier wagon
37034.04.0112.088.02395. cavalier 2-door
37131.04.0112.085.02575. j2000 se hatchback
37229.04.0135.084.02525. aries se
37327.04.0151.090.02735. phoenix
37424.04.0140.092.02865.016.482.01.0ford fairmont futura
37536.04.0105.074.01980.015.382.02.0volkswagen rabbit l
37637. glc custom l
37731. glc custom
37838.04.0105.063.02125.014.782.01.0plymouth horizon miser
37936. lynx l
38036.04.0120.088.02160.014.582.03.0nissan stanza xe
38136.04.0107.075.02205.014.582.03.0honda accord
38234.04.0108.070.02245.016.982.03.0toyota corolla
38338. civic
38432. civic (auto)
38538. 310 gx
38625.06.0181.0110.02945.016.482.01.0buick century limited
38738.06.0262.085.03015. cutlass ciera (diesel)
38826.04.0156.092.02585.014.582.01.0chrysler lebaron medallion
38922.06.0232.0112.02835.014.782.01.0ford granada l
39032.04.0144.096.02665.013.982.03.0toyota celica gt
39136.04.0135.084.02370. charger 2.2
39227.04.0151.090.02950.017.382.01.0chevrolet camaro
39327.04.0140.086.02790.015.682.01.0ford mustang gl
39444. pickup
39532.04.0135.084.02295.011.682.01.0dodge rampage
39628.04.0120.079.02625.018.682.01.0ford ranger
39731.04.0119.082.02720.019.482.01.0chevy s-10
\n", "

392 rows × 9 columns

\n", "
" ], "text/plain": [ " mpg cylinders displacement horsepower weight acceleration year \\\n", "1 18.0 8.0 307.0 130.0 3504.0 12.0 70.0 \n", "2 15.0 8.0 350.0 165.0 3693.0 11.5 70.0 \n", "3 18.0 8.0 318.0 150.0 3436.0 11.0 70.0 \n", "4 16.0 8.0 304.0 150.0 3433.0 12.0 70.0 \n", "5 17.0 8.0 302.0 140.0 3449.0 10.5 70.0 \n", "6 15.0 8.0 429.0 198.0 4341.0 10.0 70.0 \n", "7 14.0 8.0 454.0 220.0 4354.0 9.0 70.0 \n", "8 14.0 8.0 440.0 215.0 4312.0 8.5 70.0 \n", "9 14.0 8.0 455.0 225.0 4425.0 10.0 70.0 \n", "10 15.0 8.0 390.0 190.0 3850.0 8.5 70.0 \n", "11 15.0 8.0 383.0 170.0 3563.0 10.0 70.0 \n", "12 14.0 8.0 340.0 160.0 3609.0 8.0 70.0 \n", "13 15.0 8.0 400.0 150.0 3761.0 9.5 70.0 \n", "14 14.0 8.0 455.0 225.0 3086.0 10.0 70.0 \n", "15 24.0 4.0 113.0 95.0 2372.0 15.0 70.0 \n", "16 22.0 6.0 198.0 95.0 2833.0 15.5 70.0 \n", "17 18.0 6.0 199.0 97.0 2774.0 15.5 70.0 \n", "18 21.0 6.0 200.0 85.0 2587.0 16.0 70.0 \n", "19 27.0 4.0 97.0 88.0 2130.0 14.5 70.0 \n", "20 26.0 4.0 97.0 46.0 1835.0 20.5 70.0 \n", "21 25.0 4.0 110.0 87.0 2672.0 17.5 70.0 \n", "22 24.0 4.0 107.0 90.0 2430.0 14.5 70.0 \n", "23 25.0 4.0 104.0 95.0 2375.0 17.5 70.0 \n", "24 26.0 4.0 121.0 113.0 2234.0 12.5 70.0 \n", "25 21.0 6.0 199.0 90.0 2648.0 15.0 70.0 \n", "26 10.0 8.0 360.0 215.0 4615.0 14.0 70.0 \n", "27 10.0 8.0 307.0 200.0 4376.0 15.0 70.0 \n", "28 11.0 8.0 318.0 210.0 4382.0 13.5 70.0 \n", "29 9.0 8.0 304.0 193.0 4732.0 18.5 70.0 \n", "30 27.0 4.0 97.0 88.0 2130.0 14.5 71.0 \n", ".. ... ... ... ... ... ... ... 
\n", "368 28.0 4.0 112.0 88.0 2605.0 19.6 82.0 \n", "369 27.0 4.0 112.0 88.0 2640.0 18.6 82.0 \n", "370 34.0 4.0 112.0 88.0 2395.0 18.0 82.0 \n", "371 31.0 4.0 112.0 85.0 2575.0 16.2 82.0 \n", "372 29.0 4.0 135.0 84.0 2525.0 16.0 82.0 \n", "373 27.0 4.0 151.0 90.0 2735.0 18.0 82.0 \n", "374 24.0 4.0 140.0 92.0 2865.0 16.4 82.0 \n", "375 36.0 4.0 105.0 74.0 1980.0 15.3 82.0 \n", "376 37.0 4.0 91.0 68.0 2025.0 18.2 82.0 \n", "377 31.0 4.0 91.0 68.0 1970.0 17.6 82.0 \n", "378 38.0 4.0 105.0 63.0 2125.0 14.7 82.0 \n", "379 36.0 4.0 98.0 70.0 2125.0 17.3 82.0 \n", "380 36.0 4.0 120.0 88.0 2160.0 14.5 82.0 \n", "381 36.0 4.0 107.0 75.0 2205.0 14.5 82.0 \n", "382 34.0 4.0 108.0 70.0 2245.0 16.9 82.0 \n", "383 38.0 4.0 91.0 67.0 1965.0 15.0 82.0 \n", "384 32.0 4.0 91.0 67.0 1965.0 15.7 82.0 \n", "385 38.0 4.0 91.0 67.0 1995.0 16.2 82.0 \n", "386 25.0 6.0 181.0 110.0 2945.0 16.4 82.0 \n", "387 38.0 6.0 262.0 85.0 3015.0 17.0 82.0 \n", "388 26.0 4.0 156.0 92.0 2585.0 14.5 82.0 \n", "389 22.0 6.0 232.0 112.0 2835.0 14.7 82.0 \n", "390 32.0 4.0 144.0 96.0 2665.0 13.9 82.0 \n", "391 36.0 4.0 135.0 84.0 2370.0 13.0 82.0 \n", "392 27.0 4.0 151.0 90.0 2950.0 17.3 82.0 \n", "393 27.0 4.0 140.0 86.0 2790.0 15.6 82.0 \n", "394 44.0 4.0 97.0 52.0 2130.0 24.6 82.0 \n", "395 32.0 4.0 135.0 84.0 2295.0 11.6 82.0 \n", "396 28.0 4.0 120.0 79.0 2625.0 18.6 82.0 \n", "397 31.0 4.0 119.0 82.0 2720.0 19.4 82.0 \n", "\n", " origin name \n", "1 1.0 chevrolet chevelle malibu \n", "2 1.0 buick skylark 320 \n", "3 1.0 plymouth satellite \n", "4 1.0 amc rebel sst \n", "5 1.0 ford torino \n", "6 1.0 ford galaxie 500 \n", "7 1.0 chevrolet impala \n", "8 1.0 plymouth fury iii \n", "9 1.0 pontiac catalina \n", "10 1.0 amc ambassador dpl \n", "11 1.0 dodge challenger se \n", "12 1.0 plymouth 'cuda 340 \n", "13 1.0 chevrolet monte carlo \n", "14 1.0 buick estate wagon (sw) \n", "15 3.0 toyota corona mark ii \n", "16 1.0 plymouth duster \n", "17 1.0 amc hornet \n", "18 1.0 ford maverick \n", "19 3.0 datsun 
pl510 \n", "20 2.0 volkswagen 1131 deluxe sedan \n", "21 2.0 peugeot 504 \n", "22 2.0 audi 100 ls \n", "23 2.0 saab 99e \n", "24 2.0 bmw 2002 \n", "25 1.0 amc gremlin \n", "26 1.0 ford f250 \n", "27 1.0 chevy c20 \n", "28 1.0 dodge d200 \n", "29 1.0 hi 1200d \n", "30 3.0 datsun pl510 \n", ".. ... ... \n", "368 1.0 chevrolet cavalier \n", "369 1.0 chevrolet cavalier wagon \n", "370 1.0 chevrolet cavalier 2-door \n", "371 1.0 pontiac j2000 se hatchback \n", "372 1.0 dodge aries se \n", "373 1.0 pontiac phoenix \n", "374 1.0 ford fairmont futura \n", "375 2.0 volkswagen rabbit l \n", "376 3.0 mazda glc custom l \n", "377 3.0 mazda glc custom \n", "378 1.0 plymouth horizon miser \n", "379 1.0 mercury lynx l \n", "380 3.0 nissan stanza xe \n", "381 3.0 honda accord \n", "382 3.0 toyota corolla \n", "383 3.0 honda civic \n", "384 3.0 honda civic (auto) \n", "385 3.0 datsun 310 gx \n", "386 1.0 buick century limited \n", "387 1.0 oldsmobile cutlass ciera (diesel) \n", "388 1.0 chrysler lebaron medallion \n", "389 1.0 ford granada l \n", "390 3.0 toyota celica gt \n", "391 1.0 dodge charger 2.2 \n", "392 1.0 chevrolet camaro \n", "393 1.0 ford mustang gl \n", "394 2.0 vw pickup \n", "395 1.0 dodge rampage \n", "396 1.0 ford ranger \n", "397 1.0 chevy s-10 \n", "\n", "[392 rows x 9 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Auto dataset is in R ISLR package; fetch it through rpy2 and convert to pandas\n",
"islr = importr('ISLR')\n",
"auto_rdf = rdata(islr).fetch('Auto')['Auto']\n",
"auto = pandas2ri.ri2py(auto_rdf)\n",
"display(auto)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SLR MSE = 25.273723993\n" ] } ], "source": [
"# Simple linear regression features and response\n",
"features = ['horsepower']\n",
"response = ['mpg']\n",
"X = auto[features]\n",
"y = auto[response]\n",
"\n",
"# Hold-out settings shared by every validation-set fit in this lab\n",
"TEST_SIZE = 196  # rows held out for testing (half of the 392 observations)\n",
"SPLIT_SEED = 47  # fixed seed so the split is reproducible\n",
"\n",
"\n",
"def validation_mse(design, target, test_size=TEST_SIZE, random_state=SPLIT_SEED):\n",
"    \"\"\"Fit least squares on a random training split; return MSE on the held-out rows.\n",
"\n",
"    design: feature matrix (DataFrame or array), one row per observation\n",
"    target: response values aligned row-for-row with design\n",
"    test_size, random_state: forwarded unchanged to train_test_split\n",
"    \"\"\"\n",
"    d_train, d_test, t_train, t_test = train_test_split(\n",
"        design, target, test_size=test_size, random_state=random_state)\n",
"    model = LinearRegression().fit(d_train, t_train)\n",
"    return mean_squared_error(t_test, model.predict(d_test))\n",
"\n",
"\n",
"print(\"SLR MSE = \", validation_mse(X, y))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Polynomial regression of degree 2: MSE = 18.8690031195\n" ] } ], "source": [
"# Polynomial regression features of degree 2\n",
"poly2 = PolynomialFeatures(degree=2)\n",
"X2 = poly2.fit_transform(X)\n",
"\n",
"# Same hold-out split and fitting procedure as the linear model above\n",
"print(\"Polynomial regression of degree 2: MSE = \", validation_mse(X2, y))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Polynomial regression of degree 3: MSE = 18.8333669959\n" ] } ], "source": [
"# Polynomial regression features of degree 3\n",
"poly3 = PolynomialFeatures(degree=3)\n",
"X3 = poly3.fit_transform(X)\n",
"\n",
"# Same hold-out split and fitting procedure as the linear model above\n",
"print(\"Polynomial regression of degree 3: MSE = \", validation_mse(X3, y))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.2 Leave-One-Out Cross-Validation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Polynomial regression of degree 1:\n", " MSE = 24.231513517929226\n", "\n", "\n", "Polynomial regression of degree 2:\n", " MSE = 19.24821312448939\n", "\n", "\n", "Polynomial regression of degree 3:\n", " MSE = 19.334984064114092\n", "\n", "\n", "Polynomial regression of degree 4:\n", " MSE = 19.42443030854574\n", "\n", "\n", "Polynomial regression of degree 5:\n", " MSE = 19.033219754727583\n", "\n" ] } ], "source": [
"# Polynomial regression over degrees from 1 (simple linear) to 5\n",
"auto_poly = LinearRegression()\n",
"loocv = LeaveOneOut()\n",
"\n",
"for poly_deg in range(1, 6):\n",
"    print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n",
"    poly = PolynomialFeatures(degree=poly_deg)\n",
"    X_d = poly.fit_transform(X)\n",
"    scores = cross_val_score(auto_poly, X_d, y, cv=loocv, scoring='neg_mean_squared_error')\n",
"    loocv_mse = -scores.mean()  # the scorer reports negated MSE; flip the sign back\n",
"    print(' MSE = {}\\n'.format(loocv_mse))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.3 k-Fold Cross-Validation" ] }, { "cell_type": "code", "execution_count": 7, "metadata": 
{ "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Polynomial regression of degree 1:\n", " MSE = 27.439933652339857\n", "\n", "\n", "Polynomial regression of degree 2:\n", " MSE = 21.235840055802118\n", "\n", "\n", "Polynomial regression of degree 3:\n", " MSE = 21.3366061833284\n", "\n", "\n", "Polynomial regression of degree 4:\n", " MSE = 21.353886987563506\n", "\n", "\n", "Polynomial regression of degree 5:\n", " MSE = 20.905633737044845\n", "\n", "\n", "Polynomial regression of degree 6:\n", " MSE = 20.782704427497574\n", "\n", "\n", "Polynomial regression of degree 7:\n", " MSE = 20.953103378424892\n", "\n", "\n", "Polynomial regression of degree 8:\n", " MSE = 21.07713162886134\n", "\n", "\n", "Polynomial regression of degree 9:\n", " MSE = 21.036781313639857\n", "\n", "\n", "Polynomial regression of degree 10:\n", " MSE = 20.98095645636944\n", "\n" ] } ], "source": [
"# Polynomial regression over degrees from 1 (simple linear) to 10\n",
"auto_poly = LinearRegression()\n",
"# No random_state here: shuffle defaults to False, so the folds are contiguous and\n",
"# deterministic. A seed without shuffle=True was ignored by older scikit-learn and\n",
"# raises ValueError in newer releases.\n",
"kfold = KFold(n_splits=10)\n",
"\n",
"for poly_deg in range(1, 11):\n",
"    print(\"\\nPolynomial regression of degree {}:\".format(poly_deg))\n",
"    poly = PolynomialFeatures(degree=poly_deg)\n",
"    X_d = poly.fit_transform(X)\n",
"    scores = cross_val_score(auto_poly, X_d, y, cv=kfold, scoring='neg_mean_squared_error')\n",
"    kfold_mse = -scores.mean()  # the scorer reports negated MSE; flip the sign back\n",
"    print(' MSE = {}\\n'.format(kfold_mse))" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### 5.3.4 The Bootstrap" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "autoscroll": "json-false", "collapsed": false, 
"ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "

100 rows × 2 columns

\n", "
" ], "text/plain": [ " X Y\n", "1 -0.895251 -0.234924\n", "2 -1.562454 -0.885176\n", "3 -0.417090 0.271888\n", "4 1.044356 -0.734198\n", "5 -0.315568 0.841983\n", "6 -1.737124 -2.037191\n", "7 1.966413 1.452957\n", "8 2.152868 -0.434139\n", "9 -0.081208 1.450809\n", "10 -0.891782 0.821016\n", "11 -0.293202 -1.042391\n", "12 0.505779 0.608478\n", "13 0.526751 -0.222493\n", "14 1.066469 1.231357\n", "15 0.294016 0.628589\n", "16 0.042549 -1.267574\n", "17 1.830970 -0.572752\n", "18 -0.326937 -0.487472\n", "19 0.521480 2.565985\n", "20 1.399868 -0.357836\n", "21 -0.645448 -1.412431\n", "22 -0.904352 -0.568305\n", "23 -1.764586 -0.746273\n", "24 -1.810485 0.493747\n", "25 -1.169899 -2.725281\n", "26 -0.685376 -0.457616\n", "27 1.090918 0.014495\n", "28 -0.432340 -0.399831\n", "29 0.268815 -0.201608\n", "30 -0.851841 -1.741829\n", ".. ... ...\n", "71 -0.984357 -1.139160\n", "72 -1.384992 0.702700\n", "73 -0.358843 -1.694513\n", "74 -0.226618 0.801939\n", "75 -0.941077 -0.733189\n", "76 2.460336 -0.048373\n", "77 0.716797 0.602337\n", "78 -0.248087 -1.018490\n", "79 1.010773 0.052978\n", "80 2.313049 1.752359\n", "81 0.835180 0.985715\n", "82 -1.071903 -1.247298\n", "83 -1.650526 0.215465\n", "84 -0.600486 -0.420941\n", "85 -0.058529 0.127621\n", "86 0.075727 -0.522149\n", "87 -1.157832 0.590894\n", "88 1.673606 0.114623\n", "89 -1.043988 -0.418944\n", "90 0.014687 -0.558747\n", "91 0.675322 1.482630\n", "92 1.778342 0.942774\n", "93 -1.295764 -1.085204\n", "94 0.079602 -0.539101\n", "95 2.260858 0.673225\n", "96 0.479091 1.454774\n", "97 -0.535020 -0.399175\n", "98 -0.773129 -0.957175\n", "99 0.403634 1.396038\n", "100 -0.588496 -0.497285\n", "\n", "[100 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Auto dataset is in R ISLR package\n", "islr = importr('ISLR')\n", "portfolio_rdf = rdata(islr).fetch('Portfolio')['Portfolio']\n", "portfolio = pandas2ri.ri2py(portfolio_rdf)\n", "display(portfolio)" ] }, { "cell_type": "code", 
"execution_count": 9, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Portfolio alpha = 0.575832074593\n" ] } ], "source": [ "# Function to calculate the alpha for portofolio allocation\n", "def alpha(data):\n", " \"\"\"\n", " data: pandas dataframe with two columns X and Y.\n", " \"\"\"\n", "\n", " sigma = data.cov() # covariance matrix\n", " var_x = sigma.X['X']\n", " var_y = sigma.Y['Y']\n", " cov_xy = sigma.X['Y']\n", " alpha = (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)\n", " return alpha\n", "alpha_original = alpha(portfolio)\n", "print(\"Portfolio alpha = \", alpha_original)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originalbiasstd. error
\n", "
" ], "text/plain": [ " original bias std. error\n", "alpha 0.575832 0.001963 0.089929" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bootstrap with B=1000 on portfolio data\n", "N = portfolio.shape[0]\n", "B = 1000\n", "portfolio_b = resample(portfolio, n_samples=N*B, random_state=42)\n", "alphas = [alpha(group) for name, group in portfolio_b.groupby(np.arange(N * B) // N)]\n", "alpha_bias = np.mean(alphas) - alpha_original\n", "alpha_se = np.std(alphas)\n", "alpha_bootstrap = pd.DataFrame([[alpha_original, alpha_bias, alpha_se],],\n", " columns=['original', 'bias', 'std. error'], index=['alpha'])\n", "display(alpha_bootstrap)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "mpg ~ horsepower coefficients:\n", "\n", " Intercept horsepower\n", " 39.935861 -0.157845\n" ] } ], "source": [ "# Function to get simple linear regression coefficients for Auto data set\n", "def auto_coef(data, features, response):\n", " \"\"\"\n", " data: pandas dataframe sampled from the Auto data set\n", " features: a string list of feature names\n", " response: a string of response names\n", " \"\"\"\n", "\n", " auto_reg = LinearRegression()\n", " auto_reg.fit(data[features], data[response])\n", " return [auto_reg.intercept_] + list(auto_reg.coef_)\n", "\n", "features = ['horsepower']\n", "response = 'mpg'\n", "coef_original = pd.DataFrame([auto_coef(auto, features, response)], columns=['Intercept'] + features, index=[''])\n", "print(\"\\nmpg ~ horsepower coefficients:\\n\\n\", coef_original)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originalbiasstd. error
\n", "
" ], "text/plain": [ " original bias std. error\n", "Intercept 39.935861 0.033521 0.869087\n", "horsepower -0.157845 -0.000500 0.007503" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bootstrap with B=1000 on Auto data\n", "N = auto.shape[0]\n", "B = 1000\n", "auto_b = resample(auto, n_samples=N*B, random_state=42)\n", "coefs = [auto_coef(group, features, response) for name, group in auto_b.groupby(np.arange(N * B) // N)]\n", "coefs_df = pd.DataFrame(coefs, columns=['Intercept'] + features)\n", "coef_bias = coefs_df.mean() - coef_original\n", "coef_se = coefs_df.std()\n", "coef_bootstrap = pd.concat([coef_original.T.copy(), coef_bias.T, coef_se], axis=1)\n", "coef_bootstrap.columns = ['original', 'bias', 'std. error']\n", "display(coef_bootstrap)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" }, "name": "Chapter 5 Resampling Methods.ipynb" }, "nbformat": 4, "nbformat_minor": 2 }