{ "cells": [ { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "# Chapter 8 Tree-Based Methods" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "- [Lab 8.3.1 Fitting Classification Trees](#lab-8.3.1)\n", "- [Lab 8.3.2 Fitting Regression Trees](#lab-8.3.2)\n", "- [Lab 8.3.3 Bagging and Random Forests](#lab-8.3.3)\n", "- [Lab 8.3.4 Boosting](#lab-8.3.4)" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### Imports and Configurations" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "# Use rpy2 for loading R datasets\n", "from rpy2.robjects.packages import importr\n", "from rpy2.robjects.packages import data as rdata\n", "from rpy2.robjects import pandas2ri\n", "\n", "# Math and data processing\n", "import numpy as np\n", "import scipy as sp\n", "import pandas as pd\n", "\n", "# StatsModels\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "\n", "# scikit-learn\n", "from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz\n", "from sklearn.ensemble import BaggingClassifier, BaggingRegressor\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "from sklearn.preprocessing import scale\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import confusion_matrix, classification_report\n", "\n", "# Visualization\n", "from IPython.display import display\n", "from IPython.display import Image\n", "import matplotlib as mpl\n", "import 
matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "mpl.style.use('ggplot')\n", "import pydotplus" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "source": [ "### Lab 8.3.1 Fitting Classification Trees" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
211.22111.048.016.0260.083.0Good65.010.0YesYes
310.06113.035.010.0269.080.0Medium59.012.0YesYes
47.40117.0100.04.0466.097.0Medium55.014.0YesYes
\n", "
" ], "text/plain": [ " Sales CompPrice Income Advertising Population Price ShelveLoc Age \\\n", "1 9.50 138.0 73.0 11.0 276.0 120.0 Bad 42.0 \n", "2 11.22 111.0 48.0 16.0 260.0 83.0 Good 65.0 \n", "3 10.06 113.0 35.0 10.0 269.0 80.0 Medium 59.0 \n", "4 7.40 117.0 100.0 4.0 466.0 97.0 Medium 55.0 \n", "5 4.15 141.0 64.0 3.0 340.0 128.0 Bad 38.0 \n", "\n", " Education Urban US \n", "1 17.0 Yes Yes \n", "2 10.0 Yes Yes \n", "3 12.0 Yes Yes \n", "4 14.0 Yes Yes \n", "5 13.0 Yes No " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Carseats dataset is in R ISLR package\n", "islr = importr('ISLR')\n", "carseats_rdf = rdata(islr).fetch('Carseats')['Carseats']\n", "carseats = pandas2ri.ri2py(carseats_rdf)\n", "display(carseats.head(5))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "autoscroll": "json-false", "collapsed": false, "ein.tags": [ "worksheet-0" ], "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
19.50138.073.011.0276.0120.0042.017.0111
211.22111.048.016.0260.083.0165.010.0111
310.06113.035.010.0269.080.0259.012.0111
47.40117.0100.04.0466.097.0255.014.0110
54.15141.064.03.0340.0128.0038.013.0100
\n", "