--- title: "Chapter 9 Regression Models" date: "`r Sys.Date()`" output: html_document: toc: true toc_depth: 3 toc_float: collapsed: true smooth_scroll: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) ``` In this notebook, we introduce regression models including linear regression, principal component regression and partial least square. # Load R Packages ```{r, message = FALSE, warning=FALSE, results='hide'} # install packages from CRAN p_needed <- c('caret', 'dplyr', 'lattice', 'elasticnet', 'lars', 'corrplot', 'pls') packages <- rownames(installed.packages()) p_to_install <- p_needed[!(p_needed %in% packages)] if (length(p_to_install) > 0) { install.packages(p_to_install) } lapply(p_needed, require, character.only = TRUE) ``` # Linear Regression Multivariate linear regression (i.e., the typical least square regression) is one of the simplest supervised learning methods. ```{r} # Let us use the expenditure data and apply basic data pre-processing method of removing negative expense. dat <- read.csv("http://bit.ly/2P5gTw4") dat <- subset(dat, store_exp > 0 & online_exp > 0) ``` ```{r} # response variable: the sum of the expenses # explanatory variables: 10 survey questions modeldat <- dat[, grep("Q", names(dat))] modeldat$total_exp <- dat$store_exp + dat$online_exp ``` Before fitting a linear regression, let us do a few quick checks, including identifying outliers and highly correlated explanatory variables. ```{r} par(mfrow = c(1, 2)) hist(modeldat$total_exp, main = "", xlab = "total_exp") boxplot(modeldat$total_exp) ``` ```{r} # Then let us remove outliers using the z-score method y <- modeldat$total_exp # Find data points with Z-score larger than 3.5 zs <- (y - mean(y))/mad(y) modeldat <- modeldat[-which(zs > 3.5), ] ``` ```{r} # Now let us check correlation among input features. correlation <- cor(modeldat[, grep("Q", names(modeldat))]) corrplot.mixed(correlation, order = "hclust", tl.pos = "lt", upper = "ellipse") ``` ```{r} # remove highly correlated variables using findCorrelation() function in caret package. highcor <- findCorrelation(correlation, cutoff = 0.75) modeldat <- modeldat[, -highcor] ``` Now we are ready to fit a linear regression model. For this case, a log transformation of `total_exp` is applied before fitting the regression model. ```{r} lmfit <- lm(log(total_exp) ~ ., data = modeldat) summary(lmfit) ``` ```{r} # Find confidence interval for the model parameters: confint(lmfit,level=0.9) ``` After the model fitting, we need to check typical diagnostic plots and identify potential outliers. ```{r} par(mfrow = c(2, 2)) plot(lmfit, which = 1) plot(lmfit, which = 2) plot(lmfit, which = 3) plot(lmfit, which = 4) ``` ```{r} # We see three outliers after model fitting modeldat[which(row.names(modeldat) %in% c(960, 678, 155)), ] ``` ```{r} # To see why these are outliers, we can check data points match each outlier, for example using the last outlier described above. datcheck = modeldat %>% filter(Q1 ==2 & Q2 == 1 & Q3 == 1 & Q6 == 1 & Q8 == 3) nrow(datcheck) ``` ```{r} # It is now easy to see why it is an outlier, as all the other 86 records with the same survey responses have much higher total expenses. summary(datcheck$total_exp) ``` # Principal Component Regression and Partial Least Square Regression In addition to remove highly correlated explanatory variables, we can also try principal component regression to use selected first a few principal components as model input. 
# Principal Component Regression and Partial Least Squares Regression

In addition to removing highly correlated explanatory variables, we can also try principal component regression (PCR), which uses the first few principal components as model inputs. Principal component analysis is unsupervised; its supervised counterpart is partial least squares (PLS) regression. In this section, we will use the expenditure data again to illustrate these two methods. First, let us load and pre-process the data.

```{r}
# Load data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
ymad <- mad(na.omit(sim.dat$income))
# Calculate Z-scores for income
zs <- (sim.dat$income - mean(na.omit(sim.dat$income))) / ymad
# which(na.omit(zs > 3.5)) finds outliers; which(is.na(zs)) finds missing values
idex <- c(which(na.omit(zs > 3.5)), which(is.na(zs)))
# Remove rows with outliers or missing values
sim.dat <- sim.dat[-idex, ]
```

```{r}
# Use the 10 survey questions as inputs and income as the response
xtrain <- dplyr::select(sim.dat, Q1:Q10)
ytrain <- sim.dat$income
```

```{r}
# 10-fold cross-validation for model tuning
set.seed(100)
ctrl <- trainControl(method = "cv", number = 10)
```

The following code fits the principal component regression model.

```{r}
# Set random seed
set.seed(100)
pcrTune <- train(x = xtrain, y = ytrain,
                 method = "pcr",
                 # set the hyper-parameter tuning range
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
pcrTune
```

The following code fits the partial least squares regression model.

```{r}
plsTune <- train(xtrain, ytrain,
                 method = "pls",
                 # set the hyper-parameter tuning range
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
plsTune
```

```{r}
# Variable importance for the top 10 predictors in the PLS model
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 10, scales = list(y = list(cex = 0.95)))
```

The plot below shows the performance comparison for the two methods as a function of the number of components.

```{r}
# Save PLS model tuning information to plsResamples
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"
# Save PCR model tuning information to pcrResamples
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"
# Combine both outputs for plotting
plsPlotData <- rbind(plsResamples, pcrResamples)
# Use the xyplot() function from the lattice package
xyplot(RMSE ~ ncomp,
       data = plsPlotData,
       xlab = "# Components",
       ylab = "RMSE (Cross-Validation)",
       auto.key = list(columns = 2),
       groups = Model,
       type = c("o", "g"))
```
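To complement the plot, we can also compare the two tuned models numerically. The chunk below is a minimal sketch that pulls the selected number of components and the lowest cross-validated RMSE from each `train` object.

```{r}
# Number of components selected by cross-validation for each model
pcrTune$bestTune
plsTune$bestTune
# Lowest cross-validated RMSE achieved by each model
min(pcrTune$results$RMSE)
min(plsTune$results$RMSE)
```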