################################################################################
### Title: "Week 10 - Prediction I"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
################################################################################
# Clears memory
rm(list = ls())
# Clears console
cat("\014")
### Load libraries
# If you don't have one of these packages installed already, you will need to
#run install.packages() line
library(tidyverse)
library(estimatr)
library(modelr)
library(caret)
################################################################################
################ Measuring churn ###############################################
hbo = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week10/1_ModelSelection/data/hbomax.csv")
head(hbo)
# Divide data: 80% vs 20% split
set.seed(100) #Always set seed for replication! (and make sure you are running an updated version of R!)
n = nrow(hbo) # Will tell us how many observations we have
train = sample(x = 1:n, size = n*0.8) #randomly select 80% of the rows for our training sample
# slice() selects rows from a dataset based on the row number.
train.data = hbo %>% slice(train) #use only the rows that were selected for training
test.data = hbo %>% slice(-train) #the rest are used for testing
### Simple model
lm_simple = lm(logins ~ succession + city, data = train.data) #Train the model on the TRAINING DATASET
### Complex model
lm_complex = lm(logins ~ female + city + age + I(age^2) + succession, data = train.data) #Train the model on the TRAINING DATASET
# Let's look at the RMSE for these models on the TRAINING dataset:
# For simple model:
rmse(lm_simple, train.data) #rmse() in the `modelr` package takes model we are using and the data.
# For complex model:
rmse(lm_complex, train.data)
# If we wanted to actually get the predictions, we could also do that with the following line:
pred_complex_train = lm_complex %>% predict(train.data) #We start with the model, and then use the function predict on the data we want.
## Question: According to this, which model is better? Is this the comparison we want?
# Estimate RMSE for these models on the TESTING dataset (THIS IS WHAT WE WANT TO ASSESS THE MODEL):
# For simple model:
rmse(lm_simple, test.data)
# For complex model:
rmse(lm_complex, test.data)
## Question: Using this comparison, which model is better? Is this the comparison we want?
####################################################################################
# Prediction for a specific individual:
####################################################################################
#If we wanted to predict for a specific individual, we can create a new dataset and use that with the predict() function.
# Remember to create (at least) all the variables that are in the *model*!
data_new = data.frame(female = 1,
city = 1,
age = 30,
succession = 1,
unsubscribed = 0)
lm_complex %>% predict(data_new)
# Using the complex model, for a female who lives in a city and it's 30 yo and
# has watched succession, we predict 3.3 logins in the past week.
# Q: How many logins you predict using the simple model?
###############################################################################
#### Cross-validation
###############################################################################
# We will typically use 5 or 10-fold cross validation
set.seed(100) # Set seed for replication!
train.control = trainControl(method = "cv", number = 10) #This is a function from the package caret. We are telling our data that we will use a cross validation approach (cv) with 10 folds (number). Use ?trainControl to see the different methods we could use!
lm_simple_cv = train(logins ~ succession + city, data = hbo, method="lm",
trControl = train.control) #See that here (in the train function), we just pass all the data. The function will divide it in folds and do all that!
lm_simple_cv
lm_complex_cv = train(logins ~ female + city + age + I(age^2) + succession, data = hbo, method="lm",
trControl = train.control) #See that here (in the train function), we just pass all the data. The function will divide it in folds and do all that!
lm_complex_cv
# Question: Looking at both models, which one would you prefer?
################################################################################
##### Stepwise
################################################################################
set.seed(100) # Set a seed for replication
train.control = trainControl(method = "cv", number = 10) #set up a 10-fold cv
lm.fwd = train(logins ~ . - unsubscribe, data = train.data, # We take out unsubscribe because it's also an outcome (happens *after* logins)
method = "leapForward", # "leapForward" is for Forward Stepwise and "leapBackward" is for backwards.
tuneGrid = data.frame(nvmax = 1:5), #We include 5 variables, because that's all the predictors we are using for our model.
trControl = train.control)
lm.fwd$results
# We can see the number of covariates that is optimal to choose:
lm.fwd$bestTune
# And how does that model looks like:
summary(lm.fwd$finalModel)
# If we want the RMSE
rmse(lm.fwd, test.data)
# Note: Even if we are doing cross-validation, because we now have a *tuning parameter*
# (nvmax), we still need to train our model in the training data.