---
title: "Chapter 9 Regression Models"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: true
      smooth_scroll: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

In this notebook, we introduce regression models including linear regression, principal component regression and partial least square.

# Load R Packages

```{r, message = FALSE, warning=FALSE, results='hide'}
# install packages from CRAN
p_needed <- c('caret', 'dplyr', 'lattice',
              'elasticnet', 'lars', 'corrplot', 
              'pls')

packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
    install.packages(p_to_install)
}

lapply(p_needed, require, character.only = TRUE)
```

# Linear Regression

Multivariate linear regression (i.e., the typical least square regression) is one of the simplest supervised learning methods.

```{r}
# Let us use the expenditure data and apply basic data pre-processing method of removing negative expense.

dat <- read.csv("http://bit.ly/2P5gTw4")
dat <- subset(dat, store_exp > 0 & online_exp > 0)
```

```{r}
# response variable: the sum of the expenses 
# explanatory variables: 10 survey questions

modeldat <- dat[, grep("Q", names(dat))]
modeldat$total_exp <- dat$store_exp + dat$online_exp
```

Before fitting a linear regression, let us do a few quick checks, including identifying outliers and highly correlated explanatory variables.


```{r}
par(mfrow = c(1, 2))
hist(modeldat$total_exp, main = "", xlab = "total_exp")
boxplot(modeldat$total_exp)
```


```{r}
# Then let us remove outliers using the z-score method

y <- modeldat$total_exp
# Find data points with Z-score larger than 3.5
zs <- (y - mean(y))/mad(y)
modeldat <- modeldat[-which(zs > 3.5), ]
```


```{r}
# Now let us check correlation among input features.

correlation <- cor(modeldat[, grep("Q", names(modeldat))])
corrplot.mixed(correlation, order = "hclust", tl.pos = "lt", 
    upper = "ellipse")
```


```{r}
# remove highly correlated variables using findCorrelation() function in caret package.

highcor <- findCorrelation(correlation, cutoff = 0.75)
modeldat <- modeldat[, -highcor]
```

Now we are ready to fit a linear regression model. For this case, a log transformation of `total_exp` is applied before fitting the regression model.

```{r}
lmfit <- lm(log(total_exp) ~ ., data = modeldat)
summary(lmfit)
```

```{r}
# Find confidence interval for the model parameters:
confint(lmfit,level=0.9)
```

After the model fitting, we need to check typical diagnostic plots and identify potential outliers.

```{r}
par(mfrow = c(2, 2))
plot(lmfit, which = 1)
plot(lmfit, which = 2)
plot(lmfit, which = 3)
plot(lmfit, which = 4)
```

```{r}
# We see three outliers after model fitting
modeldat[which(row.names(modeldat) %in% c(960, 678, 155)), ]
```

```{r}
# To see why these are outliers, we can check data points match each outlier, for example using the last outlier described above.
datcheck = modeldat %>% filter(Q1 ==2 & Q2 == 1 & Q3 == 1 & Q6 == 1 & Q8 == 3) 
nrow(datcheck)
```

```{r}
# It is now easy to see why it is an outlier, as all the other 86 records with the same survey responses have much higher total expenses.
summary(datcheck$total_exp)
```

# Principal Component Regression and Partial Least Square Regression

In addition to remove highly correlated explanatory variables, we can also try principal component regression to use selected first a few principal components as model input. Principal components analysis is unsupervised, and the corresponding supervised version is partial least square regression. In this section, we will use the expenditure data again to illustrate these two methods.

First, let us load the pre-process the data.

```{r}
# Load Data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
ymad <- mad(na.omit(sim.dat$income))

# Calculate Z values
zs <- (sim.dat$income - mean(na.omit(sim.dat$income)))/ymad
# which(na.omit(zs>3.5)) find outlier 
# which(is.na(zs)) find missing values
idex <- c(which(na.omit(zs > 3.5)), which(is.na(zs)))
# Remove rows with outlier and missing values
sim.dat <- sim.dat[-idex, ]
```

```{r}
xtrain = dplyr::select(sim.dat, Q1:Q10)
ytrain = sim.dat$income
```

```{r}
set.seed(100)
ctrl <- trainControl(method = "cv", number = 10)
```

The following code is for Principal Component Regression.

```{r}
# Set random seed
 set.seed(100)
 pcrTune <- train(x = xtrain, y = ytrain,
          method = "pcr",
          # set hyper-parameter tuning range
          tuneGrid = expand.grid(.ncomp = 1:10),
          trControl = ctrl)
 pcrTune
```

The following code is for Partial Least Square Regression.

```{r}
plsTune <- train(xtrain, ytrain, 
                 method = "pls", 
                 # set hyper-parameter tuning range
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
plsTune
```


```{r}
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 10, scales = list(y = list(cex = 0.95)))
```

The plot below shows the performance comparsion for these two methods as a function of number of components.


```{r}
# Save PLS model tuning information to plsResamples
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"

# Save PCR model tuning information to plsResamples
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"

# Combine both output for plotting
plsPlotData <- rbind(plsResamples, pcrResamples)

# Leverage xyplot() function from lattice library
xyplot(RMSE ~ ncomp, data = plsPlotData, xlab = "# Components", 
    ylab = "RMSE (Cross-Validation)", auto.key = list(columns = 2), 
    groups = Model, type = c("o", "g"))
```