################################################################################
### Title: "Week 13 - Bagging, Random Forests, and Boosting"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
################################################################################

# Clears memory
rm(list = ls())

# Clears console
cat("\014")

### Load libraries
# If you don't have one of these packages installed already, you will need to
# run the corresponding install.packages() line
library(tidyverse)
library(caret)
library(modelr)
library(rpart)
library(rattle)
library(rsample) # This helps us divide our data by strata

#install.packages("xgboost")
# You will need to install xgboost. For that, if you use Windows, you need to have
# RTools installed on your computer first. Check this out if you have Windows:
# https://www.rdocumentation.org/packages/installr/versions/0.22.0/topics/install.Rtools
# After that, install xgboost in R.
# If installing it from CRAN fails, you can also install it by downloading the file from
# https://cran.r-project.org/src/contrib/Archive/xgboost/ and going to
# "Tools --> Install Packages" in RStudio.

####################
## Car seat sales
####################

#library(ISLR)
#data("Carseats")

# The data comes from the ISLR package (the package for the book we are using!),
# but here we load it directly from a CSV:
Carseats = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week13/1_RandomForests/data/Carseats.csv")

head(Carseats) # Explore the data

set.seed(100)

split = initial_split(Carseats, prop = 0.7, strata = "Sales")

carseats.train = training(split)
carseats.test = testing(split)

# We set this tuneGrid after looking at the plot.
# Exercise: With which tuneGrid would you begin?
tuneGrid = expand.grid(cp = seq(0, 0.015, length = 100))

mcv = train(Sales ~ ., data = carseats.train,
            method = "rpart",
            trControl = trainControl("cv", number = 10),
            tuneGrid = tuneGrid)

fancyRpartPlot(mcv$finalModel) # This is the best way to plot the tree out of the caret package

# We can also look at the most important covariates, based on that tree:
varImp(mcv, scale = TRUE)
plot(varImp(mcv, scale = TRUE))

# Question: Why isn't the most important covariate in this plot necessarily the one
# at the top of the tree?
# The variable at the top of the tree is the *single* variable that reduces the RMSE the
# most, but there are additional splits further down where other variables also play a
# role. varImp() considers all of these splits.

#### Bagging

# How do these compare to the importance of the covariates for the bagged trees? Let's see.

set.seed(100)

# We now run bagged trees using the caret package
bt = train(Sales ~ ., data = carseats.train,
           method = "treebag",
           trControl = trainControl("cv", number = 10),
           nbagg = 100,
           control = rpart.control(cp = 0))
# Why do we set cp = 0? How large is a tree if the complexity parameter is 0?

plot(varImp(bt, scale = TRUE)) # Importance of the covariates!

# We can get the RMSE the same way as always:
rmse(bt, carseats.test)

# And we can also get the predicted sales!
pred.sales = bt %>% predict(carseats.test)
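# To see what "treebag" is doing under the hood, here is a minimal "by hand" sketch of
# bagging: draw B bootstrap samples, grow a deep tree (cp = 0) on each one, and average
# their predictions. (This is just an illustration using rpart; B = 100 is an arbitrary
# choice here, and the results will differ slightly from caret's treebag.)
set.seed(100)
B = 100
pred.mat = matrix(NA, nrow = nrow(carseats.test), ncol = B)

for(b in 1:B){
  # Bootstrap sample: draw n observations with replacement from the training data
  boot.ids = sample(nrow(carseats.train), replace = TRUE)
  boot.data = carseats.train[boot.ids, ]

  # Grow a deep (unpruned) tree on the bootstrap sample
  tree.b = rpart(Sales ~ ., data = boot.data, control = rpart.control(cp = 0))

  # Store its predictions for the test set
  pred.mat[, b] = predict(tree.b, newdata = carseats.test)
}

# The bagged prediction is just the average of the B trees' predictions
pred.bag.manual = rowMeans(pred.mat)

# Test RMSE of the hand-rolled bagged trees (it should be in the same ballpark as rmse(bt, carseats.test))
sqrt(mean((carseats.test$Sales - pred.bag.manual)^2))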
###### Random forests

set.seed(100)

# We have a lot of parameters!
# (We can also use tuneLength as an overall default instead of specifying a full grid.)
tuneGrid = expand.grid(
  mtry = 1:11, # Number of covariates randomly selected at each split (the max should be the total number of predictors in your data if you want to test all of them)
  splitrule = "variance", # Split rule (IMPORTANT: for regression use "variance", for classification use "gini")
  min.node.size = 5 # You can vary this (e.g. include c(5, 10) if you want to test different min node sizes) or just leave it out
)

rfcv = train(Sales ~ ., data = carseats.train,
             method = "ranger", # You can also use "rf", but "ranger" is faster!
             trControl = trainControl("cv", number = 10),
             importance = "permutation",
             tuneGrid = tuneGrid)

# Question: Change tuneGrid = tuneGrid to tuneLength = 10. What do you get?

rfcv # Here you can see the main results!

# We can plot the tuning parameter we were testing (only mtry in this case) here:
plot(rfcv)

# Question: How many trees is this RF running? You can see it here:
rfcv$finalModel # See results here

#### Run faster

# One of the problems with random forests (and other methods) is that they can be
# quite slow. Can we speed them up? Yes!
# We can parallelize some of the process. For that, we'll need a couple of new packages:
library(parallel)
library(doParallel)

cl = makePSOCKcluster(detectCores() - 1) # detectCores() detects the number of cores on your computer. By convention, we leave 1 core for the OS.
registerDoParallel(cl)

# Let's run our RF (and let's measure how long it takes!):
system.time(rfcv <- train(Sales ~ ., data = carseats.train,
                          method = "ranger",
                          trControl = trainControl("cv", number = 10,
                                                   allowParallel = TRUE), # We need to include the allowParallel argument
                          tuneLength = 10)
)

stopCluster(cl) # We need to close the cluster once we are done
registerDoSEQ() # We also need to unregister the parallel backend

# The parallel RF takes about 12s on my computer. Let's see how long the original RF
# takes without parallelizing:
system.time(rfcv <- train(Sales ~ ., data = carseats.train,
                          method = "ranger",
                          trControl = trainControl("cv", number = 10),
                          tuneLength = 10)
)
# This takes around 30s, so using a parallel approach can save you a bunch of time!

#### Boosting

# Now we are going to run some boosting
set.seed(100)

gbm = train(Sales ~ ., data = carseats.train,
            method = "gbm", # We are using gradient boosting
            trControl = trainControl("cv", number = 10),
            tuneLength = 20) # Play around with this parameter!

# Final model information
gbm$finalModel

# Best tuning parameters?
gbm$bestTune

# You can also try extreme gradient boosting, which is more efficient (and faster!).
# If you don't want to run it in parallel, just comment out the cluster setup
# (makePSOCKcluster/registerDoParallel) and teardown (stopCluster/registerDoSEQ)
# lines below, and remove the allowParallel option.
cl = makePSOCKcluster(detectCores() - 1)
registerDoParallel(cl)

set.seed(100)

xgbm = train(Sales ~ ., data = carseats.train,
             method = "xgbTree", # We are using extreme gradient boosting
             trControl = trainControl("cv", number = 10,
                                      allowParallel = TRUE))

stopCluster(cl)
registerDoSEQ()

# Final model information
xgbm$finalModel

# Best tuning parameters?
xgbm$bestTune

# Exercise: Compare the test-set RMSE of all the different models we have fit.
# Which one does best?
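# One way to tackle the comparison exercise: compute the test RMSE for each fitted
# model and collect them in a small table. (A sketch assuming the objects mcv, bt,
# rfcv, gbm, and xgbm fitted above are still in memory; rfcv here is whichever random
# forest you ran last.)
data.frame(model = c("Pruned tree", "Bagged trees", "Random forest", "GBM", "XGBoost"),
           rmse = c(rmse(mcv, carseats.test),
                    rmse(bt, carseats.test),
                    rmse(rfcv, carseats.test),
                    rmse(gbm, carseats.test),
                    rmse(xgbm, carseats.test)))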
#########################################################################
#### Classification problem

## Exercise: Repeat the same models as before, but now use `HighSales` as your outcome.
## What do you get?

Carseats = Carseats %>% mutate(HighSales = ifelse(Sales <= 8, 0, 1)) # Build a binary outcome

set.seed(100)

# Create a new training and testing dataset
split = initial_split(Carseats, prop = 0.7, strata = "HighSales")

carseats.train = training(split)
carseats.test = testing(split)

# Exercise: 1) Find the best pruned decision tree, 2) Run some bagged trees,
# 3) Run a random forest, and 4) Run a boosted model.
# You can do all of that by adapting the code above!

# I will help you with some boosting code:

set.seed(100)

# Gradient boosting (note that we exclude Sales, since HighSales is built from it)
gbm = train(factor(HighSales) ~ . - Sales, data = carseats.train,
            method = "gbm", # Run gradient boosting
            trControl = trainControl("cv", number = 10),
            tuneLength = 10) # You can play around with this. For example, change it to 15 or 20. What do you get?

# Predicted classes and test-set accuracy for the gradient boosting model
pred.values.boost = gbm %>% predict(carseats.test)

mean(pred.values.boost == carseats.test$HighSales)
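# Beyond overall accuracy, a confusion matrix breaks the errors down by class.
# (A short sketch using caret's confusionMatrix(); it assumes the pred.values.boost
# and carseats.test objects created above.)
confusionMatrix(data = pred.values.boost,
                reference = factor(carseats.test$HighSales))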