--- title: "Chapter 7 Model Tuning Strategy" date: "`r Sys.Date()`" output: html_document: toc: true toc_depth: 3 toc_float: collapsed: true smooth_scroll: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) ``` In this notebook, we illustrate basic model tuning strategies. # Load R Packages ```{r, message = FALSE, warning=FALSE, results='hide'} # install packages from CRAN p_needed <- c('ggplot2','tidyr', 'caret', 'dplyr', 'lattice', 'proxy', 'caret') packages <- rownames(installed.packages()) p_to_install <- p_needed[!(p_needed %in% packages)] if (length(p_to_install) > 0) { install.packages(p_to_install) } lapply(p_needed, require, character.only = TRUE) ``` # Variance-Bias Trade-Off In this section, we will show the variance-bias trade-off through a simulated example. We first simulate a dataset and plot the data. ```{r} # randomly simulate some non-linear samples x = seq(1, 10, 0.01) * pi e = rnorm(length(x), mean = 0, sd = 0.2) fx <- sin(x) + e + sqrt(x) dat = data.frame(x, fx) ``` If we fit a linear regression model to this dataset, it will be a high bias model. ```{r} ggplot(dat, aes(x, fx)) + geom_point() + geom_smooth(method = "lm", se = FALSE) ``` If we use a smoothing method to fit the data, it will be a high variance model. ```{r} ggplot(dat, aes(x, fx)) + geom_smooth(span = 0.03) ``` To better illustrate the variance-bias trade-off, we now simulate four subsets of the original dataset and try to fit a smoothing model and a linear model. As the plots show, the smoothing model is more flexible to the training data, while the linear model is more robust to different subsets. ```{r} # set random seed set.seed(2016) # sample part of the data to fit model sample 1 idx1 = sample(1:length(x), 100) dat1 = data.frame(x1 = x[idx1], fx1 = fx[idx1]) p1 = ggplot(dat1, aes(x1, fx1)) + geom_smooth(span = 0.03) + geom_point() # sample 2 idx2 = sample(1:length(x), 100) dat2 = data.frame(x2 = x[idx2], fx2 = fx[idx2]) p2 = ggplot(dat2, aes(x2, fx2)) + geom_smooth(span = 0.03) + geom_point() # sample 3 idx3 = sample(1:length(x), 100) dat3 = data.frame(x3 = x[idx3], fx3 = fx[idx3]) p3 = ggplot(dat3, aes(x3, fx3)) + geom_smooth(span = 0.03) + geom_point() # sample 4 idx4 = sample(1:length(x), 100) dat4 = data.frame(x4 = x[idx4], fx4 = fx[idx4]) p4 = ggplot(dat4, aes(x4, fx4)) + geom_smooth(span = 0.03) + geom_point() # Load multiplot function source('http://bit.ly/2KeEIg9') multiplot(p1, p2, p3, p4, cols = 2) ``` ```{r} p1 = ggplot(dat1, aes(x1, fx1)) + geom_smooth(method = "lm", se = FALSE) + geom_point() p2 = ggplot(dat2, aes(x2, fx2)) + geom_smooth(method = "lm", se = FALSE) + geom_point() p3 = ggplot(dat3, aes(x3, fx3)) + geom_smooth(method = "lm", se = FALSE) + geom_point() p4 = ggplot(dat4, aes(x4, fx4)) + geom_smooth(method = "lm", se = FALSE) + geom_point() multiplot(p1, p2, p3, p4, cols = 2) ``` # Data Splitting and Resampling Data splitting and resampling are fundamental techniques to build sound models for prediction. ## Data Splitting The `createDataPartition()` function in `caret` package will return a balanced splitting based on assigned variable. 
# Data Splitting and Resampling

Data splitting and resampling are fundamental techniques for building sound predictive models.

## Data Splitting

The `createDataPartition()` function in the `caret` package returns a split that is balanced (stratified) on the assigned variable.

```{r}
# load data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
# set random seed to ensure reproducibility
set.seed(3456)
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = 0.8,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
```

```{r}
# get training set
datTrain <- sim.dat[trainIndex, ]
# get test set
datTest <- sim.dat[-trainIndex, ]
```

Let's check the distribution of `segment` in the two sets:

```{r, message=FALSE}
datTrain %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
                   percentage = round(n() / nrow(datTrain), 2))
```

```{r, message=FALSE}
datTest %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
                   percentage = round(n() / nrow(datTest), 2))
```

An alternative is to split the data based on the predictors. The goal is to obtain a subset that is as diverse as possible, so the sample is representative of the dataset.

```{r}
# select variables
testing <- subset(sim.dat, select = c("age", "income"))
```

Randomly select 5 samples as the initial subset (`start`); the rest go into `samplePool`:

```{r}
set.seed(5)
# select 5 random samples
startSet <- sample(1:dim(testing)[1], 5)
start <- testing[startSet, ]
# save the rest in data frame 'samplePool'
samplePool <- testing[-startSet, ]
```

Use `maxDissim()` to select another 5 samples from `samplePool` that are as different as possible from the initial set `start`:

```{r}
selectId <- maxDissim(start, samplePool, obj = minDiss, n = 5)
minDissSet <- samplePool[selectId, ]
```

For comparison, randomly select another 5 samples from `samplePool`:

```{r}
selectId <- sample(1:dim(samplePool)[1], 5)
RandomSet <- samplePool[selectId, ]
```

Plot the initial set together with the two new samples. Maximum dissimilarity sampling tends to spread the new points across the predictor space, while random sampling may not:

```{r}
start$group <- rep("Initial Set", nrow(start))
minDissSet$group <- rep("Maximum Dissimilarity Sampling", nrow(minDissSet))
RandomSet$group <- rep("Random Sampling", nrow(RandomSet))
xyplot(age ~ income,
       data = rbind(start, minDissSet, RandomSet),
       grid = TRUE,
       group = group,
       auto.key = TRUE)
```

For time series data, random sampling is usually not the best approach, because it ignores the temporal order of the observations. First, simulate a series:

```{r}
# simulate an AR(1) time series sample
timedata = arima.sim(list(order = c(1, 0, 0), ar = -.9), n = 100)
# plot the time series
plot(timedata, main = (expression(AR(1)~~~phi==-.9)))
```

`createTimeSlices()` creates rolling-window training/test slices that respect the time order:

```{r}
timeSlices <- createTimeSlices(1:length(timedata),
                               initialWindow = 36,
                               horizon = 12,
                               fixedWindow = TRUE)
str(timeSlices, max.level = 1)
```

```{r}
# list of indices for all training slices
trainSlices <- timeSlices[[1]]
# list of indices for all test slices
testSlices <- timeSlices[[2]]
# check the indices for the 1st training set
trainSlices[[1]]
```

```{r}
# check the indices for the 1st test set
testSlices[[1]]
```

## Resampling

k-fold cross-validation partitions the original sample into k equal-sized subsamples (folds). Each fold in turn is held out for validation while the model is fit on the remaining k - 1 folds.

```{r}
class <- sim.dat$segment
# create k folds
set.seed(1)
cv <- createFolds(class, k = 10, returnTrain = TRUE)
str(cv)
```

The repeated training/test splits method repeats the training/test division on the original data several times.

```{r}
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = .8,
                                  list = FALSE,
                                  times = 5)
dplyr::glimpse(trainIndex)
```
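In practice, these resampling schemes are usually not consumed by hand; they are passed to `caret::train()` through `trainControl()`. The sketch below is a minimal, assumed workflow rather than part of the original chapter: it reuses the 10 folds created above, with `age` and `income` as illustrative predictors and a classification tree (`method = "rpart"`) as an arbitrary model choice.

```{r}
# a minimal sketch (assumed workflow, see note above)
sim.dat$segment <- as.factor(sim.dat$segment)  # train() needs a factor outcome

# reuse the training-row indices from createFolds() above
ctrl <- trainControl(method = "cv", index = cv)

set.seed(1)
fit <- train(segment ~ age + income,   # illustrative predictors
             data = sim.dat,
             method = "rpart",         # an arbitrary simple classifier
             trControl = ctrl)
fit
```

Because the folds are supplied through `index`, every candidate model in the tuning grid is evaluated on exactly the same resamples, which makes their cross-validated performance directly comparable.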