--- title: "Chapter 11 Tree-Based Methods" date: "`r Sys.Date()`" output: html_document: toc: true toc_depth: 3 toc_float: collapsed: true smooth_scroll: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) ``` # Load R Packages ```{r, message = FALSE, warning=FALSE, results='hide'} # install packages from CRAN p_needed <- c('rpart', 'caret', 'partykit', 'pROC', 'dplyr', 'ipred', 'e1071', 'randomForest', 'gbm') packages <- rownames(installed.packages()) p_to_install <- p_needed[!(p_needed %in% packages)] if (length(p_to_install) > 0) { install.packages(p_to_install) } lapply(p_needed, require, character.only = TRUE) ``` # Regression Tree ```{r} # first load data dat <- read.csv("http://bit.ly/2P5gTw4") head(dat) ``` ```{r} # data cleaning: delete wrong observations dat <- subset(dat, store_exp > 0 & online_exp > 0) # use the 10 survey questions as predictors trainx <- dat[, grep("Q", names(dat))] # use the sum of store and online expenditure as response variable # total expenditure = store expenditure + online expenditure trainy <- dat$store_exp + dat$online_exp ``` ```{r} # set fixed random seed such that the results can be duplicated set.seed(100) # fit the model using caret library function train to find the best tree model # based on RMSE on the cross-validation set rpartTune <- train(trainx, trainy, method = "rpart2", tuneLength = 10, trControl = trainControl(method = "cv")) plot(rpartTune) ``` ```{r} # from the graph above, RMSE doesn’t change much when the maximum is larger than 2. # So we set the maximum depth to be 2 and refit the model: rpartTree <- rpart(trainy ~ ., data = trainx, maxdepth = 2) print(rpartTree) ``` To visualize the tree, you can convert rpart object to party object using `partykit` then use `plot()` function: ```{r} rpartTree2 <- partykit::as.party(rpartTree) plot(rpartTree2) ``` # Decision Tree ```{r} dat <- read.csv("http://bit.ly/2P5gTw4") head(dat) ``` In the code below, we showed two ways to deal with categorical data, use it as it is (approach 1) and covert it to dummy variables (approach 2) ```{r} # use the 10 survey questions as predictors trainx1 <- dat[, grep("Q", names(dat))] # add a categorical predictor # use two ways to treat categorical predictor # trainx1: use approach 1, without encoding trainx1$segment <- dat$segment head(trainx1) ``` ```{r} # trainx2: use approach 2, encode it to a set of dummy variables dumMod<-dummyVars(~., data=trainx1, # Combine the previous variable name and the level name # as the new dummy variable name levelsOnly=F) trainx2 <- predict(dumMod,trainx1) head(trainx2) ``` ```{r} # the response variable is gender trainy <- dat$gender head(trainy) ``` ```{r} ## model search using approach 1 for categorical variable rpartTune1 <- caret::train(trainx1, trainy, method = "rpart", tuneLength = 30, metric = "ROC", trControl = trainControl(method = "cv", summaryFunction = twoClassSummary, classProbs = TRUE, savePredictions = TRUE)) ``` ```{r} rpartTune1 ``` ```{r} rpartRoc <- pROC::roc(response = rpartTune1$pred$obs, predictor = rpartTune1$pred$Female, levels = rev(levels(rpartTune1$pred$obs))) rpartRoc ``` ```{r} ## model search using approach 2 for categorical variable rpartTune2 <- caret::train(trainx2, trainy, method = "rpart", tuneLength = 30, metric = "ROC", trControl = trainControl(method = "cv", summaryFunction = twoClassSummary, classProbs = TRUE, savePredictions = TRUE)) rpartTune2 ``` ```{r} rpartFactorRoc <- pROC::roc(response = rpartTune2$pred$obs, predictor = 
```{r}
# compare the ROC curves for the two approaches
plot.roc(rpartRoc,
         type = "s",
         print.thres = c(.5),
         print.thres.pch = 3,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "red",
         legacy.axes = TRUE,
         print.thres.col = "red")

plot.roc(rpartFactorRoc,
         type = "s",
         add = TRUE,
         print.thres = c(.5),
         print.thres.pch = 16,
         legacy.axes = TRUE,
         print.thres.pattern = "",
         print.thres.cex = 1.2)

legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("red", "black"),
       pch = c(3, 16))
```

# Bagging Tree Model

```{r}
# load and prepare the data
dat <- read.csv("http://bit.ly/2P5gTw4")
# use the 10 survey questions as predictors
trainx <- dat[, grep("Q", names(dat))]
# add segment as a predictor
# no need to encode it as dummy variables
trainx$segment <- as.factor(dat$segment)
# use gender as the response variable
trainy <- as.factor(dat$gender)
```

```{r}
# use 200 trees for bagging; there are no tuning parameters for this method
set.seed(100)
bagTune <- caret::train(trainx, trainy,
                        method = "treebag",
                        nbagg = 200,
                        metric = "ROC",
                        trControl = trainControl(
                          method = "cv",
                          summaryFunction = twoClassSummary,
                          classProbs = TRUE,
                          savePredictions = TRUE))
```

```{r}
bagTune
```

# Random Forest

```{r}
# tune over candidate values of mtry,
# the number of predictors sampled at each split
mtryValues <- c(1:5)
# train and tune random forest models
set.seed(100)
rfTune <- train(x = trainx,
                y = trainy,
                # set the model to be random forest
                method = "rf",
                ntree = 200,
                tuneGrid = data.frame(.mtry = mtryValues),
                importance = TRUE,
                metric = "ROC",
                trControl = trainControl(
                  method = "cv",
                  summaryFunction = twoClassSummary,
                  classProbs = TRUE,
                  savePredictions = TRUE))
```

```{r}
rfTune
```

```{r}
# fit the model with the best mtry value
rfit <- randomForest(trainy ~ ., data = trainx, mtry = 5, ntree = 2000)
```

```{r}
# print the relative importance of each variable
importance(rfit)
```

```{r}
# plot the relative importance of each variable
varImpPlot(rfit)
```

# Gradient Boosted Tree

```{r}
# set up the tuning grid
gbmGrid <- expand.grid(interaction.depth = c(1, 5, 9),
                       n.trees = c(6:10) * 10,
                       shrinkage = c(.01, .1),
                       n.minobsinnode = 10)
set.seed(100)
gbmTune <- caret::train(x = trainx,
                        y = trainy,
                        method = "gbm",
                        tuneGrid = gbmGrid,
                        metric = "ROC",
                        verbose = FALSE,
                        trControl = trainControl(
                          method = "cv",
                          summaryFunction = twoClassSummary,
                          classProbs = TRUE,
                          savePredictions = TRUE))
```

```{r}
gbmTune
```
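As with the single tree earlier, it helps to look at how the cross-validated performance changes across the tuning grid before settling on a final model. The chunk below is a quick check using standard `caret` accessors:

```{r}
# visualize cross-validated performance across the tuning grid
plot(gbmTune)
# the parameter combination that caret selected
gbmTune$bestTune
```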
# Comparing ROC Curves

Now, let us plot the ROC curves for the different methods in the same graph.

```{r}
treebagRoc <- pROC::roc(response = bagTune$pred$obs,
                        predictor = bagTune$pred$Female,
                        levels = rev(levels(bagTune$pred$obs)))
treebagRoc
```

```{r}
rfRoc <- pROC::roc(response = rfTune$pred$obs,
                   predictor = rfTune$pred$Female,
                   levels = rev(levels(rfTune$pred$obs)))
rfRoc
```

```{r}
gbmRoc <- pROC::roc(response = gbmTune$pred$obs,
                    predictor = gbmTune$pred$Female,
                    levels = rev(levels(gbmTune$pred$obs)))
gbmRoc
```

```{r}
plot.roc(rpartRoc,
         type = "s",
         print.thres = c(.5),
         print.thres.pch = 16,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "black",
         legacy.axes = TRUE,
         print.thres.col = "black")

plot.roc(treebagRoc,
         type = "s",
         add = TRUE,
         print.thres = c(.5),
         print.thres.pch = 3,
         legacy.axes = TRUE,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "red",
         print.thres.col = "red")

plot.roc(rfRoc,
         type = "s",
         add = TRUE,
         print.thres = c(.5),
         print.thres.pch = 1,
         legacy.axes = TRUE,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "green",
         print.thres.col = "green")

plot.roc(gbmRoc,
         type = "s",
         add = TRUE,
         print.thres = c(.5),
         print.thres.pch = 10,
         legacy.axes = TRUE,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "blue",
         print.thres.col = "blue")

legend("topleft", cex = 0.8,
       c("Single Tree", "Bagged Tree", "Random Forest", "Boosted Tree"),
       lwd = c(1, 1, 1, 1),
       col = c("black", "red", "green", "blue"),
       pch = c(16, 3, 1, 10))
```

Since the data here don't have many variables, we don't see a dramatic difference among the models, but the ensemble methods are still better than the single tree. In most real applications, ensemble methods perform much better. Random forest and boosted trees make good baseline models: before exploring different models, you can quickly run a random forest to see what performance is achievable and then try to improve on it. If the random forest performs little better than guessing, that usually means the current data are not enough to solve the problem, and you should consider collecting more data or reframing the problem rather than trying other models.
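To make the baseline idea concrete, here is a minimal sketch that compares the out-of-bag error of a quick random forest with the error of always guessing the majority class; the object names (`rf_base`, `oob_err`, `nir_err`) are illustrative, not from the chapter:

```{r}
# quick baseline check (sketch): compare random forest OOB error
# with the no-information (majority-class) error rate
set.seed(100)
rf_base <- randomForest(x = trainx, y = trainy, ntree = 200)
# OOB error after all trees have been grown
oob_err <- rf_base$err.rate[rf_base$ntree, "OOB"]
# error from always predicting the most frequent class
nir_err <- 1 - max(prop.table(table(trainy)))
round(c(rf_oob = oob_err, majority_class = nir_err), 3)
```

If `rf_oob` is not meaningfully lower than `majority_class`, collecting more data or reframing the problem is usually a better investment than trying more models.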