################################################################################
### Title: "Week 12 - Decision Trees"
### Course: STA 235H
### Semester: Fall 2022
### Professor: Magdalena Bennett
################################################################################

# Clears memory
rm(list = ls())
# Clears console
cat("\014")

### Load libraries
# If you don't have one of these packages installed already, you will need to
# run the corresponding install.packages() line first.
library(tidyverse)
library(estimatr)
library(modelr)
library(caret)
library(rpart)
library(rattle)

################################################################################
################ Measuring churn ###############################################

hbo = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week12/1_DecisionTrees/data/hbomax2.csv")

head(hbo)

# In this case, I have already provided the training dataset:
train.data = hbo %>% filter(train==1)
test.data = hbo %>% filter(train==0)

###############################################################################
##### Classification tree
###############################################################################

set.seed(100)

train.control = trainControl(method = "cv", number = 10) # Set up a 10-fold CV

ct = train(factor(unsubscribe) ~ . - id,
           data = train.data,
           method = "rpart",
           tuneLength = 15, # This tells the model to choose 15 values of `cp` and test them (a good starting point, but tuneGrid is better!)
           trControl = train.control)

# What is our best complexity parameter in this case?
ct$bestTune

# We can also see the plot here:
plot(ct)

# Let's see the final classification tree:
ct$finalModel

# Question: Which one is your most predictive covariate here?

# We can also plot this in a prettier way using the rattle package:
fancyRpartPlot(ct$finalModel, caption = "Classification Tree")

# Question: How would you classify someone who hasn't watched Succession and
# has more than one login in the previous week?

# Finally, we can also estimate the accuracy of this model, the same way we did
# it for our classification task in ridge or lasso:
pred.unsubscribe = ct %>% predict(test.data)

mean(pred.unsubscribe == test.data$unsubscribe)

# Exercise: Instead of using tuneLength = 15, use
# tuneGrid = expand.grid(cp = seq(0, 0.015, length = 50)) and see what you get!

############################################
#### Regression Tree
############################################

# Let's now complete the same task as before, but using logins as our outcome.
# Also, let's add some more stuff!

set.seed(100)

train.control = trainControl(method = "cv", number = 10) # Set up a 10-fold CV

rt = train(logins ~ . - unsubscribe - id,
           data = train.data,
           method = "rpart",
           tuneGrid = expand.grid(cp = seq(0, 0.01, length = 50)), # This tells the model to test 50 (equally-spaced) values of cp between 0 and 0.01
           trControl = train.control)

# What is our best complexity parameter in this case?
rt$bestTune

# We can also see the plot here:
plot(rt)

# Let's see our regression tree
fancyRpartPlot(rt$finalModel, caption = "Regression Tree")

# Now, we can estimate the RMSE of this model as well:
rmse(rt, test.data)

# Exercise: Compare this model with lasso or ridge regression from the previous
# class. Which one does better?
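# A minimal sketch of that comparison, assuming lasso is fit with caret's
# "glmnet" method and a hypothetical lambda grid; the exact specification you
# used in the previous class may differ. (Requires the glmnet package to be
# installed.)

set.seed(100)

lasso = train(logins ~ . - unsubscribe - id,
              data = train.data,
              method = "glmnet",
              tuneGrid = expand.grid(alpha = 1, # alpha = 1 corresponds to lasso
                                     lambda = seq(0, 1, length = 50)), # Hypothetical grid; adjust to match your previous class
              trControl = train.control)

# Compare test-set RMSE (lower is better):
rmse(lasso, test.data)
rmse(rt, test.data)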
# Finally, we can also add other stopping conditions to make sure our leaves
# don't get too small:

set.seed(100)

rt = train(logins ~ . - unsubscribe - id,
           data = train.data,
           method = "rpart",
           tuneGrid = expand.grid(cp = seq(0, 0.01, length = 70)), # Just for fun, I changed the length of this vector to 70
           trControl = train.control,
           control = rpart.control(minsplit = 15)) # This tells the model that there need to be at least 15 observations in a node for it to attempt a split!

# What is our best complexity parameter in this case?
rt$bestTune
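# As a follow-up, we can plot the constrained tree and check its test-set RMSE,
# reusing the same evaluation as above:

fancyRpartPlot(rt$finalModel, caption = "Regression Tree (minsplit = 15)")

rmse(rt, test.data)

# Other rpart.control() stopping arguments work the same way, e.g. minbucket
# (the minimum number of observations in any terminal leaf) or maxdepth
# (the maximum depth of any node in the final tree).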