## ----------------------------------------------------------------------------
## Name: 03-lm-ii.R
## Description: Script for Chapter 3 of "Notes for Predictive Modeling"
## Link: https://bookdown.org/egarpor/PM-UC3M/
## License: https://creativecommons.org/licenses/by-nc-nd/4.0/
## Author: Eduardo García-Portugués
## Version: 5.10.0
## ----------------------------------------------------------------------------

## ----case2-2, eval = FALSE--------------------------------------------------------------
## # Read data
## Boston <- readxl::read_excel(path = "Boston.xlsx", sheet = 1, col_names = TRUE)
##
## # # Alternatively
## # data(Boston, package = "MASS")

## ----case2-3----------------------------------------------------------------------------
summary(Boston)

## ----scat2, fig.cap = '(ref:scat2-title)', fig.margin = FALSE----------------------------
car::scatterplotMatrix(~ crim + dis + medv + nox + rm, regLine = list(col = 2),
                       col = 1, smooth = list(col.smooth = 4, col.spread = 4),
                       data = Boston)

## ----lmitcasesn, results = 'hide'---------------------------------------------------------
# Data: n observations and p = n - 1 predictors
set.seed(123456)
n <- 5
p <- n - 1
df <- data.frame(y = rnorm(n), x = matrix(rnorm(n * p), nrow = n, ncol = p))

# Case p = n - 1 = 4: beta can be estimated, but sigma^2 cannot (hence, no
# inference can be performed, since it requires the estimated sigma^2)
summary(lm(y ~ ., data = df))

# Case p = n - 2 = 3: both beta and sigma^2 can be estimated (hence, inference
# can be performed)
summary(lm(y ~ . - x.1, data = df))

## ----case2-4----------------------------------------------------------------------------
# Two models with different predictors
mod1 <- lm(medv ~ age + crim, data = Boston)
mod2 <- lm(medv ~ age + crim + lstat, data = Boston)

# BICs
BIC(mod1)
BIC(mod2) # Smaller -> better

# AICs
AIC(mod1)
AIC(mod2) # Smaller -> better

# Check the summaries
summary(mod1)
summary(mod2)

## ----case2-6, eval = FALSE--------------------------------------------------------------
## # Load data -- notice that "Year" is also included
## wine <- read.csv(file = "wine.csv", header = TRUE)

## ----bic-1------------------------------------------------------------------------------
# Full model
mod <- lm(Price ~ ., data = wine)

# With AIC
modAIC <- MASS::stepAIC(mod, k = 2)

# The result is an lm object
summary(modAIC)

# With BIC
modBIC <- MASS::stepAIC(mod, k = log(nrow(wine)))

## ----bic-2------------------------------------------------------------------------------
# Add an irrelevant predictor to the wine dataset
set.seed(123456)
wineNoise <- wine
n <- nrow(wineNoise)
wineNoise$noisePredictor <- rnorm(n)

# Backward selection: removes predictors sequentially from the given model

# Starting from the model with all the predictors
modAll <- lm(formula = Price ~ ., data = wineNoise)
MASS::stepAIC(modAll, direction = "backward", k = log(n))

# Starting from an intermediate model
modInter <- lm(formula = Price ~ noisePredictor + Year + AGST, data = wineNoise)
MASS::stepAIC(modInter, direction = "backward", k = log(n))
# Recall that other predictors not included in modInter are not explored
# during the search (so the relevant predictor HarvestRain is not added)

# Forward selection: adds predictors sequentially from the given model

# Starting from the model with no predictors, only intercept (denoted as ~ 1)
modZero <- lm(formula = Price ~ 1, data = wineNoise)
MASS::stepAIC(modZero, direction = "forward",
              scope = list(lower = modZero, upper = modAll), k = log(n))
# It is very important to set the scope argument adequately when doing forward
# search! The scope defines the "minimum" (lower) and "maximum" (upper) models
# that bound the set of explorable models. If it is not provided, the maximum
# model is taken to be the passed starting model (in this case modZero) and
# stepAIC will not do any search
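
# Aside (added sketch, not in the original script): calling the forward search
# without a scope illustrates the warning above -- the upper model defaults to
# the starting model, so there is nothing to add and no search is performed
MASS::stepAIC(modZero, direction = "forward", k = log(n))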
# Starting from an intermediate model
MASS::stepAIC(modInter, direction = "forward",
              scope = list(lower = modZero, upper = modAll), k = log(n))
# Recall that predictors included in modInter are not dropped during the
# search (so the irrelevant noisePredictor is kept)

# Both selection: useful if starting from an intermediate model
# Removes the problems associated with "backward" and "forward" searches done
# from intermediate models
MASS::stepAIC(modInter, direction = "both",
              scope = list(lower = modZero, upper = modAll), k = log(n))
# It is also very important to correctly define the scope, because "both"
# resorts to "forward" (as well as to "backward")

# Using the defaults from the full model essentially does backward selection,
# but allowing predictors that were removed to enter again at later steps
MASS::stepAIC(modAll, direction = "both", k = log(n))

# Omit lengthy outputs
MASS::stepAIC(modAll, direction = "both", trace = 0,
              scope = list(lower = modZero, upper = modAll), k = log(n))

## ----type-aic---------------------------------------------------------------------------
# Same BICs, different scale
n <- nobs(modBIC)
extractAIC(modBIC, k = log(n))[2]
BIC(modBIC)
# Observe that MASS::stepAIC(mod, k = log(nrow(wine))) returned as final BIC
# the one given by extractAIC(), not by BIC()! But both are equivalent: they
# just differ by a constant shift

# Same AICs, different scale
extractAIC(modAIC, k = 2)[2]
AIC(modAIC)

# The additive constant: BIC() includes it but extractAIC() does not
BIC(modBIC) - extractAIC(modBIC, k = log(n))[2]
n * (log(2 * pi) + 1) + log(n)

# Same for the AIC
AIC(modAIC) - extractAIC(modAIC)[2]
n * (log(2 * pi) + 1) + 2

## ----case2-7----------------------------------------------------------------------------
modHouse <- lm(medv ~ ., data = Boston)
summary(modHouse)

## ----case2-8----------------------------------------------------------------------------
# Best BIC and AIC models
modBIC <- MASS::stepAIC(modHouse, k = log(nrow(Boston)))
modAIC <- MASS::stepAIC(modHouse, trace = 0, k = 2)

# Comparison: same fits
car::compareCoefs(modBIC, modAIC)
summary(modBIC)

# Confidence intervals
confint(modBIC)

## ----qua--------------------------------------------------------------------------------
# iris dataset -- factors in the last column
summary(iris)

# Summary of a linear model
mod1 <- lm(Sepal.Length ~ ., data = iris)
summary(mod1)
# Speciesversicolor (D1) coefficient: -0.72356. The average increment of
# Sepal.Length when the species is versicolor instead of setosa (reference)
# Speciesvirginica (D2) coefficient: -1.02350. The average increment of
# Sepal.Length when the species is virginica instead of setosa (reference)
# Both dummy variables are significant

# How to set a different level as reference (versicolor)
iris$Species <- relevel(iris$Species, ref = "versicolor")

# Same estimates, except for the dummy coefficients
mod2 <- lm(Sepal.Length ~ ., data = iris)
summary(mod2)
# Speciessetosa (D1) coefficient: 0.72356. The average increment of
# Sepal.Length when the species is setosa instead of versicolor (reference)
# Speciesvirginica (D2) coefficient: -0.29994. The average increment of
# Sepal.Length when the species is virginica instead of versicolor (reference)
# Both dummy variables are significant
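
# Aside (added sketch, not in the original script): the dummy variables behind
# these coefficients can be inspected directly in the design matrix of the model
head(model.matrix(mod2))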
# Confidence intervals for the coefficients of the model
confint(mod2)
# The coefficients of Speciessetosa and Speciesvirginica are
# significantly positive and negative, respectively

# Show the dummy variables employed for encoding a factor
contrasts(iris$Species)
iris$Species <- relevel(iris$Species, ref = "setosa")
contrasts(iris$Species)

## ----case2-9----------------------------------------------------------------------------
# Load the Boston dataset
data(Boston, package = "MASS")

# Structure of the data
str(Boston)
# chas is a dummy variable measuring whether the suburb is close to the
# river (1) or not (0). In this case it is not coded as a factor but as a
# 0/1 variable (so it is already dummified)

# Summary of a linear model
mod <- lm(medv ~ chas + crim, data = Boston)
summary(mod)
# The coefficient associated with chas is 5.57772. That means that if the
# suburb is close to the river, the mean of medv increases by 5.57772 units,
# for the same house and neighborhood conditions
# chas is significant (the presence of the river adds valuable information
# for explaining medv)

# Summary of the best model in terms of BIC
summary(modBIC)
# The coefficient associated with chas is 2.71871. If the suburb is close to
# the river, the mean of medv increases by 2.71871 units
# chas is significant as well in the presence of more predictors

## ----datatra----------------------------------------------------------------------------
# Data
x <- c(-2, -1.9, -1.7, -1.6, -1.4, -1.3, -1.1, -1, -0.9, -0.7, -0.6, -0.4,
       -0.3, -0.1, 0, 0.1, 0.3, 0.4, 0.6, 0.7, 0.9, 1, 1.1, 1.3, 1.4, 1.6,
       1.7, 1.9, 2, 2.1, 2.3, 2.4, 2.6, 2.7, 2.9, 3, 3.1, 3.3, 3.4, 3.6, 3.7,
       3.9, 4, 4.1, 4.3, 4.4, 4.6, 4.7, 4.9, 5)
y <- c(1.4, 0.4, 2.4, 1.7, 2.4, 0, 0.3, -1, 1.3, 0.2, -0.7, 1.2, -0.1, -1.2,
       -0.1, 1, -1.1, -0.9, 0.1, 0.8, 0, 1.7, 0.3, 0.8, 1.2, 1.1, 2.5, 1.5,
       2, 3.8, 2.4, 2.9, 2.7, 4.2, 5.8, 4.7, 5.3, 4.9, 5.1, 6.3, 8.6, 8.1,
       7.1, 7.9, 8.4, 9.2, 12, 10.5, 8.7, 13.5)

# Data frame (a matrix with column names)
nonLinear <- data.frame(x = x, y = y)

# We create a new column inside nonLinear, called x2, that contains the
# new variable x^2
nonLinear$x2 <- nonLinear$x^2

# If you wish to remove it
# nonLinear$x2 <- NULL

# Regressions
mod1 <- lm(y ~ x, data = nonLinear)
mod2 <- lm(y ~ x2, data = nonLinear)
summary(mod1)
summary(mod2)
# mod2 has a larger R^2. Also, note that the intercept of mod2 is not
# significant
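
# Aside (added sketch, not in the original script): the same quadratic fit can
# be obtained without creating the x2 column by wrapping the transformation
# with I() inside the formula
summary(lm(y ~ I(x^2), data = nonLinear))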
## ----pol-1-1, fig.cap = '(ref:pol-1-1-title)'--------------------------------------------
x1 <- seq(-1, 1, l = 4)
poly(x = x1, degree = 2, raw = TRUE) # (X, X^2)
poly(x = x1, degree = 2) # By default, it employs orthogonal polynomials

# Depiction of raw polynomials
x <- seq(-1, 1, l = 200)
degree <- 5
matplot(x, poly(x, degree = degree, raw = TRUE), type = "l", lty = 1,
        ylab = expression(x^k))
legend("bottomright", legend = paste("k =", 1:degree), col = 1:degree, lwd = 2)

## ----pol-1-2, fig.cap = '(ref:pol-1-2-title)'--------------------------------------------
# Depiction of orthogonal polynomials
matplot(x, poly(x, degree = degree), type = "l", lty = 1,
        ylab = expression(p[k](x)))
legend("bottomright", legend = paste("k =", 1:degree), col = 1:degree, lwd = 2)

## ----pol-2-1, fig.cap = '(ref:pol-2-1-title)'--------------------------------------------
# Data containing speed (mph) and stopping distance (ft) of cars from the 1920s
data(cars)
plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)")

# Fit a linear model of dist ~ speed
mod1 <- lm(dist ~ speed, data = cars)
abline(coef = mod1$coefficients, col = 2)

# Quadratic
mod2 <- lm(dist ~ poly(speed, degree = 2), data = cars)
# The quadratic fit is not a straight line, so abline() cannot be used; we
# evaluate the fit on a grid and add it with lines()
d <- seq(0, 25, length.out = 200)
lines(d, predict(mod2, new = data.frame(speed = d)), col = 3)

# Cubic
mod3 <- lm(dist ~ poly(speed, degree = 3), data = cars)
lines(d, predict(mod3, new = data.frame(speed = d)), col = 4)

# 10th order -- overfitting
mod10 <- lm(dist ~ poly(speed, degree = 10), data = cars)
lines(d, predict(mod10, new = data.frame(speed = d)), col = 5)

# BICs -- the linear model is better!
BIC(mod1, mod2, mod3, mod10)

# poly() computes orthogonal polynomials by default. These are not
# X^1, X^2, ..., X^p but combinations of them such that the polynomials are
# orthogonal. "Raw" polynomials are obtained with raw = TRUE. They give the
# same fit, but the coefficient estimates are different
mod2Raw <- lm(dist ~ poly(speed, degree = 2, raw = TRUE), data = cars)
plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)")
lines(d, predict(mod2, new = data.frame(speed = d)), col = 1)
lines(d, predict(mod2Raw, new = data.frame(speed = d)), col = 2)

## ----pol-2-2, fig.cap = '(ref:pol-2-2-title)', fig.show = 'hold'-------------------------
# However: different coefficient estimates, but same R^2. How is this possible?
summary(mod2)
summary(mod2Raw)

# Because the predictors in mod2Raw are highly correlated with each other,
# while the ones in mod2 are uncorrelated!
car::scatterplotMatrix(mod2$model[, -1], col = 1, regLine = list(col = 2),
                       smooth = list(col.smooth = 4, col.spread = 4))
car::scatterplotMatrix(mod2Raw$model[, -1], col = 1, regLine = list(col = 2),
                       smooth = list(col.smooth = 4, col.spread = 4))
cor(mod2$model[, -1])
cor(mod2Raw$model[, -1])

## ----int-1------------------------------------------------------------------------------
# Interaction between lstat and age
summary(lm(medv ~ lstat + lstat:age, data = Boston))
# For a unit increment in age, the effect of lstat on the response
# increases by 0.004103 units, shifting from -1.388161 to -1.384058.
# Thus, as age increases, lstat affects medv less negatively.
# Note that the same interpretation does NOT hold if we switch the roles
# of age and lstat, because age is not present as a sole predictor!
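
# Aside (added sketch, not in the original script): the effective slope of
# lstat is beta_lstat + beta_{lstat:age} * age, which illustrates the
# interpretation above at several values of age
cf <- coef(lm(medv ~ lstat + lstat:age, data = Boston))
cf["lstat"] + cf["lstat:age"] * c(25, 50, 100) # Less negative as age grows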
# First-order interaction
summary(lm(medv ~ lstat * age, data = Boston))

# Second-order interaction
summary(lm(medv ~ lstat * age * indus, data = Boston))

## ----int-2------------------------------------------------------------------------------
# Include first-order interactions in the search for the best model in
# terms of BIC, not just single predictors
modIntBIC <- MASS::stepAIC(object = lm(medv ~ ., data = Boston),
                           scope = medv ~ .^2, k = log(nobs(modBIC)), trace = 0)
summary(modIntBIC)

# There is no improvement by removing terms in modIntBIC
MASS::dropterm(modIntBIC, k = log(nobs(modIntBIC)), sorted = TRUE)

# Nor by including other interaction terms
MASS::addterm(modIntBIC, scope = lm(medv ~ .^2, data = Boston),
              k = log(nobs(modIntBIC)), sorted = TRUE)

## ----int-3------------------------------------------------------------------------------
# Group settings
col <- Boston$chas + 3
cex <- 0.5 + 0.25 * Boston$chas

# 1. No dummy variable
(mod1 <- lm(medv ~ lstat, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "1")
abline(coef = mod1$coefficients, lwd = 2)

# 2. Dummy variable
(mod2 <- lm(medv ~ lstat + chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "2")
abline(a = mod2$coefficients[1], b = mod2$coefficients[2], col = 3, lwd = 2)
abline(a = mod2$coefficients[1] + mod2$coefficients[3],
       b = mod2$coefficients[2], col = 4, lwd = 2)

# 3. Dummy variable, with interaction
(mod3 <- lm(medv ~ lstat * chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "3")
abline(a = mod3$coefficients[1], b = mod3$coefficients[2], col = 3, lwd = 2)
abline(a = mod3$coefficients[1] + mod3$coefficients[3],
       b = mod3$coefficients[2] + mod3$coefficients[4], col = 4, lwd = 2)

# 4. Dummy variable only present in the interaction
(mod4 <- lm(medv ~ lstat + lstat:chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "4")
abline(a = mod4$coefficients[1], b = mod4$coefficients[2], col = 3, lwd = 2)
abline(a = mod4$coefficients[1],
       b = mod4$coefficients[2] + mod4$coefficients[3], col = 4, lwd = 2)

# 5. Dummy variable and no predictor
(mod5 <- lm(medv ~ chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "5")
abline(a = mod5$coefficients[1], b = 0, col = 3, lwd = 2)
abline(a = mod5$coefficients[1] + mod5$coefficients[2], b = 0, col = 4, lwd = 2)

# 6. Dummy variable. Interaction in the intercept and slope
(mod6 <- lm(medv ~ chas + lstat:chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "6")
abline(a = mod6$coefficients[1], b = 0, col = 3, lwd = 2)
abline(a = mod6$coefficients[1] + mod6$coefficients[2],
       b = mod6$coefficients[3], col = 4, lwd = 2)

# 7. Dummy variable. Interaction in the slope
(mod7 <- lm(medv ~ lstat:chas, data = Boston))
plot(medv ~ lstat, data = Boston, col = col, pch = 16, cex = cex, main = "7")
abline(a = mod7$coefficients[1], b = 0, col = 3, lwd = 2)
abline(a = mod7$coefficients[1], b = mod7$coefficients[2], col = 4, lwd = 2)
## ----int-4------------------------------------------------------------------------------
# Model using a dummy variable in the full dataset
lm(medv ~ lstat + chas + lstat:chas, data = Boston)

# Individual model for the group with chas == 0
lm(medv ~ lstat, data = Boston, subset = chas == 0)
# Notice that the intercept and lstat coefficient are the same as before

# Individual model for the group with chas == 1
lm(medv ~ lstat, data = Boston, subset = chas == 1)
# Notice that the intercept and lstat coefficient equal the ones from the
# joint model, plus the specific terms associated with chas

## ----int-5, fig.cap = '(ref:int-5-title)'------------------------------------------------
# Does not take into account the groups in the data
modIris <- lm(Sepal.Width ~ Petal.Width, data = iris)
modIris$coefficients

# Adding interactions with the groups
modIrisSpecies <- lm(Sepal.Width ~ Petal.Width * Species, data = iris)
modIrisSpecies$coefficients

# The joint regression line shows a negative correlation, but each group's
# regression line shows a positive correlation
plot(Sepal.Width ~ Petal.Width, data = iris, col = as.integer(Species) + 1,
     pch = 16)
abline(a = modIris$coefficients[1], b = modIris$coefficients[2], lwd = 2)
abline(a = modIrisSpecies$coefficients[1], b = modIrisSpecies$coefficients[2],
       col = 2, lwd = 2)
abline(a = modIrisSpecies$coefficients[1] + modIrisSpecies$coefficients[3],
       b = modIrisSpecies$coefficients[2] + modIrisSpecies$coefficients[5],
       col = 3, lwd = 2)
abline(a = modIrisSpecies$coefficients[1] + modIrisSpecies$coefficients[4],
       b = modIrisSpecies$coefficients[2] + modIrisSpecies$coefficients[6],
       col = 4, lwd = 2)

## ----case2-11, eval = FALSE--------------------------------------------------------------
## load("wine.RData")
## mod <- lm(Price ~ Age + AGST + HarvestRain + WinterRain, data = wine)
## summary(mod)

## ----diag-1, fig.cap = '(ref:diag-1-title)'----------------------------------------------
plot(mod, 1)

## ----diag-2, fig.cap = '(ref:diag-2-title)'----------------------------------------------
par(mfrow = c(2, 2)) # We have 4 predictors
termplot(mod, partial.resid = TRUE)

## ----diag-3, fig.asp = 1/2, fig.margin = FALSE-------------------------------------------
par(mfrow = c(1, 2))
plot(lm(y ~ x, data = nonLinear), 1) # Nonlinear
plot(lm(y ~ I(x^2), data = nonLinear), 1) # Linear

## ----diag-4, fig.cap = '(ref:diag-4-title)'----------------------------------------------
plot(mod, 2)

## ----diag-5-----------------------------------------------------------------------------
# Shapiro-Wilk test of normality
shapiro.test(mod$residuals) # We do not reject normality
# shapiro.test() allows up to 5000 observations -- when dealing with more data
# points, testing a random subsample of the residuals is a possibility

# Lilliefors test -- the Kolmogorov-Smirnov adaptation for testing normality
nortest::lillie.test(mod$residuals) # We do not reject normality
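
# Aside (added sketch, not in the original script): a quick visual complement
# to the tests -- histogram of the residuals with the N(0, sigma^2) density
# implied by the fit overlaid
hist(mod$residuals, freq = FALSE, main = "Residuals of mod", xlab = "Residuals")
curve(dnorm(x, mean = 0, sd = summary(mod)$sigma), add = TRUE, col = 2)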
## ----box-yeo, fig.asp = 1/2---------------------------------------------------------------
# Test data

# Predictors
n <- 200
set.seed(121938)
X1 <- rexp(n, rate = 1 / 5) # Non-negative
X2 <- rchisq(n, df = 5) - 3 # Real

# Response of a linear model
epsilon <- rchisq(n, df = 10) - 10 # Centered error, but not normal
Y <- 10 - 0.5 * X1 + X2 + epsilon

# Transformation of non-normal data to achieve normal-like data (no model)

# Optimal lambda for Box-Cox
BC <- car::powerTransform(lm(X1 ~ 1), family = "bcPower") # Maximum-likelihood fit
# Note that we use a regression model with no predictors
(lambdaBC <- BC$lambda) # The optimal lambda
# lambda < 1, so positive skewness is corrected

# Box-Cox transformation
X1Transf <- car::bcPower(U = X1, lambda = lambdaBC)

# Comparison
par(mfrow = c(1, 2))
hist(X1, freq = FALSE, breaks = 10, ylim = c(0, 0.3))
hist(X1Transf, freq = FALSE, breaks = 10, ylim = c(0, 0.3))

# Optimal lambda for Yeo-Johnson
YJ <- car::powerTransform(lm(X2 ~ 1), family = "yjPower")
(lambdaYJ <- YJ$lambda)

# Yeo-Johnson transformation
X2Transf <- car::yjPower(U = X2, lambda = lambdaYJ)

# Comparison
par(mfrow = c(1, 2))
hist(X2, freq = FALSE, breaks = 10, ylim = c(0, 0.3))
hist(X2Transf, freq = FALSE, breaks = 10, ylim = c(0, 0.3))

# Transformation of a non-normal response in a linear model

# Optimal lambda for Yeo-Johnson
YJ <- car::powerTransform(lm(Y ~ X1 + X2), family = "yjPower")
(lambdaYJ <- YJ$lambda)

# Yeo-Johnson transformation
YTransf <- car::yjPower(U = Y, lambda = lambdaYJ)

# Comparison for the residuals
par(mfrow = c(1, 2))
plot(lm(Y ~ X1 + X2), 2)
plot(lm(YTransf ~ X1 + X2), 2) # Slightly better

## ----diag-6, fig.cap = '(ref:diag-6-title)'----------------------------------------------
plot(mod, 3)

## ----diag-7-----------------------------------------------------------------------------
# Breusch-Pagan test
car::ncvTest(mod) # We do not reject homoscedasticity

## ----breusch, fig.cap = '(ref:breusch-title)', fig.show = 'hold'-------------------------
# Heteroskedastic models
set.seed(123456)
x <- rnorm(100)
y1 <- 1 + 2 * x + rnorm(100, sd = x^2)
y2 <- 1 + 2 * x + rnorm(100, sd = 1 + x * (x > 0))
modHet1 <- lm(y1 ~ x)
modHet2 <- lm(y2 ~ x)

# Heteroskedasticity not detected
car::ncvTest(modHet1)
plot(modHet1, 3)

# Heteroskedasticity correctly detected
car::ncvTest(modHet2)
plot(modHet2, 3)
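
# Aside (added sketch, not in the original script; assumes the lmtest package
# is installed): the studentized Breusch-Pagan test is an alternative to
# car::ncvTest(). By default it tests against the model's predictors rather
# than against the fitted values
lmtest::bptest(modHet2)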
## ----diag-8, fig.cap = '(ref:diag-8-title)', fig.show = 'hold'---------------------------
# Artificial data with heteroskedasticity
set.seed(12345)
X <- rchisq(500, df = 3)
e <- rnorm(500, sd = sqrt(0.1 + 2 * X))
Y <- 1 + X + e

# Original
plot(lm(Y ~ X), 3) # Very heteroskedastic

# Transformed
plot(lm(I(log(abs(Y))) ~ X), 3) # Much less heteroskedastic, but at the price
# of losing the signs in Y...

# Shifted and transformed
delta <- 1 # This is tunable
m <- -min(Y) + delta
plot(lm(I(log(Y + m)) ~ X), 3) # No loss of signs

# Transformed by Yeo-Johnson

# Optimal lambda for Yeo-Johnson
YJ <- car::powerTransform(lm(Y ~ X), family = "yjPower")
(lambdaYJ <- YJ$lambda)

# Yeo-Johnson transformation
YTransf <- car::yjPower(U = Y, lambda = lambdaYJ)
plot(lm(YTransf ~ X), 3) # Slightly less heteroskedastic

## ----diag-9, fig.cap = '(ref:diag-9-title)'----------------------------------------------
plot(mod$residuals, type = "o")

## ----diag-10----------------------------------------------------------------------------
lag.plot(mod$residuals, lags = 1, do.lines = FALSE)
# No serious serial trend, but some negative autocorrelation is apparent
cor(mod$residuals[-1], mod$residuals[-length(mod$residuals)])

## ----diag-11----------------------------------------------------------------------------
# Durbin-Watson test
car::durbinWatsonTest(mod) # Does not reject at alpha = 0.05

## ----multico-1, message = FALSE, fig.cap = '(ref:multico-1-title)'-----------------------
# Numerically
round(cor(wine), 2)

# Graphically
corrplot::corrplot(cor(wine), addCoef.col = "grey")

## ----multico-2, fig.cap = '(ref:multico-2-title)'----------------------------------------
# Create predictors with multicollinearity: x4 depends on the rest
set.seed(45678)
x1 <- rnorm(100)
x2 <- 0.5 * x1 + rnorm(100)
x3 <- 0.5 * x2 + rnorm(100)
x4 <- -x1 + x2 + rnorm(100, sd = 0.25)

# Response
y <- 1 + 0.5 * x1 + 2 * x2 - 3 * x3 - x4 + rnorm(100)
data <- data.frame(x1 = x1, x2 = x2, x3 = x3, x4 = x4, y = y)

# Correlations -- none seems suspicious
corrplot::corrplot(cor(data), addCoef.col = "grey")

## ----multico-3--------------------------------------------------------------------------
# Abnormal variance inflation factors: largest for x4, we remove it
modMultiCo <- lm(y ~ x1 + x2 + x3 + x4)
car::vif(modMultiCo)

# Without x4
modClean <- lm(y ~ x1 + x2 + x3)

# Comparison
car::compareCoefs(modMultiCo, modClean)
confint(modMultiCo)
confint(modClean)

# Summaries
summary(modMultiCo)
summary(modClean)

# Variance inflation factors are normal
car::vif(modClean)

## ----outl-1, fig.cap = '(ref:outl-1-title)'----------------------------------------------
plot(mod, 5)

## ----outl-2-----------------------------------------------------------------------------
# Create data
set.seed(12345)
x <- rnorm(100)
e <- rnorm(100, sd = 0.5)
y <- 1 + 2 * x + e

# Leverage expected value
2 / 101 # (p + 1) / n

# Base model
m0 <- lm(y ~ x)
plot(x, y)
abline(coef = m0$coefficients, col = 2)
plot(m0, 5)
summary(m0)

# Make an outlier
x[101] <- 0; y[101] <- 30
m1 <- lm(y ~ x)
plot(x, y)
abline(coef = m1$coefficients, col = 2)
plot(m1, 5)
summary(m1)

# Make a high-leverage point
x[101] <- 10; y[101] <- 5
m2 <- lm(y ~ x)
plot(x, y)
abline(coef = m2$coefficients, col = 2)
plot(m2, 5)
summary(m2)

## ----outl-3-----------------------------------------------------------------------------
# Access leverage statistics
head(influence(model = m2, do.coef = FALSE)$hat)

# Another option
h <- hat(x = x)

# Cutoff for the 1% most influential points
n <- length(x)
p <- 1
hist(h, breaks = 20)
abline(v = (qchisq(0.99, df = p) + 1) / n, col = 2)

# Standardized residuals
rs <- rstandard(m2)
plot(m2, 2) # QQ-plot
points(qnorm(ppoints(n = n)), sort(rs), col = 2, pch = '+') # Manually computed
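
# Aside (added sketch, not in the original script): Cook's distances combine
# residual size and leverage into a single influence measure; observation 101
# is expected to stand out in m2
head(sort(cooks.distance(m2), decreasing = TRUE))
plot(m2, 4) # Cook's distance plot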
## ----laliga-2, eval = FALSE--------------------------------------------------------------
## laliga <- readxl::read_excel("la-liga-2015-2016.xlsx", sheet = 1, col_names = TRUE)
## laliga <- as.data.frame(laliga) # Avoid tibble since it drops row.names

## ----laliga-3---------------------------------------------------------------------------
rownames(laliga) <- laliga$Team # Set teams as case names to avoid factors
laliga$Team <- NULL
laliga <- laliga[, -c(2, 8)] # Remove columns that do not add relevant information
summary(laliga)

## ----pca-1------------------------------------------------------------------------------
# PCA
pcaLaliga <- princomp(laliga, fix_sign = TRUE)
summary(pcaLaliga)
# The standard deviations are the square roots of the eigenvalues
# The cumulative proportion of variance explained accumulates the
# variance explained starting at the first component
# We use fix_sign = TRUE so that the signs of the loadings are
# determined by the first element of each loading, set to be
# non-negative. Otherwise, the signs could change across OSs /
# R versions, yielding opposite interpretations of the PCs

# Plot of the variances of each component (screeplot)
plot(pcaLaliga, type = "l")
# Useful for detecting an "elbow" in the graph whose location gives the
# "right" number of components to retain. Ideally, the elbow appears at the
# point where the remaining variances are similar to each other and notably
# smaller than the previous ones

# Alternatively: plot of the cumulated percentage of variance
# barplot(cumsum(pcaLaliga$sdev^2) / sum(pcaLaliga$sdev^2))

# Computation of PCA from the spectral decomposition
n <- nrow(laliga)
eig <- eigen(cov(laliga) * (n - 1) / n)
# By default, cov() computes the quasi-variance-covariance matrix that divides
# by n - 1, but PCA and princomp() consider the sample variance-covariance
# matrix that divides by n
A <- eig$vectors

# Same eigenvalues
pcaLaliga$sdev^2 - eig$values

# The eigenvectors (the a_j vectors) are the column vectors in $loadings
pcaLaliga$loadings

# The scores are the representation of the data in the principal components --
# they carry the same information as laliga
head(round(pcaLaliga$scores, 4))

# Uncorrelated
corrplot::corrplot(cor(pcaLaliga$scores), addCoef.col = "gray")
# Caution! What happened in the last columns? The variance of the last
# principal components is close to zero (because there are linear dependencies
# among the variables; e.g., Points, Wins, Loses, Draws), so the computation of
# the correlation matrix becomes unstable for those variables (a 0/0 division
# takes place)

# Better to inspect the covariance matrix
corrplot::corrplot(cov(pcaLaliga$scores), addCoef.col = "gray",
                   is.corr = FALSE)
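
# Aside (added sketch, not in the original script): the variances of the scores
# (with divisor n, as princomp() uses) coincide with the squared standard
# deviations of the principal components
apply(pcaLaliga$scores, 2, var) * (n - 1) / n
pcaLaliga$sdev^2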
# The scores are A' * (X_i - mu). We center the data with scale()
# and then multiply each row by A'
scores <- scale(laliga, center = TRUE, scale = FALSE) %*% A

# Same as (but this is much slower)
# scores <- t(apply(scale(laliga, center = TRUE, scale = FALSE), 1,
#                   function(x) t(A) %*% x))

# Same scores (up to possible changes in signs)
max(abs(abs(pcaLaliga$scores) - abs(scores)))

# Reconstruct the data from all the principal components
head(
  sweep(pcaLaliga$scores %*% t(pcaLaliga$loadings), 2, pcaLaliga$center, "+")
)

## ----pca-2, fig.show = 'hold', fig.cap = '(ref:pca-2-title)'-----------------------------
# Use cor = TRUE to standardize variables (all have unit variance)
# and avoid scale distortions
pcaLaligaStd <- princomp(x = laliga, cor = TRUE, fix_sign = TRUE)
summary(pcaLaligaStd)

# The effects of the distortion can be clearly seen with the biplot

# Variability absorbed by Shots, Shots.on.goal, Fouls.made
biplot(pcaLaliga, cex = 0.75)

# The effects of the variables are more balanced
biplot(pcaLaligaStd, cex = 0.75)

## ----pca-4, echo = FALSE, fig.margin = FALSE, fig.cap = '(ref:pca-4-title)'--------------
biplot(pcaLaligaStd, cex = 0.75)

## ----pca3d, eval = knitr:::is_html_output()----------------------------------------------
pca3d::pca3d(pcaLaligaStd, show.labels = TRUE, biplot = TRUE)
rgl::rglwidget()

## ----pca-5, fig.cap = '(ref:pca-5-title)', fig.show = 'hold'------------------------------
biplot(pcaLaligaStd, choices = c(1, 3)) # 0.7138 proportion of variance
biplot(pcaLaligaStd, choices = c(2, 3)) # 0.2180 proportion of variance

## ----pcr-1------------------------------------------------------------------------------
# A linear model is problematic
mod <- lm(Points ~ . - Wins - Draws - Loses - Matches.without.conceding,
          data = laliga)
summary(mod) # Lots of non-significant predictors

# We try to clean the model
modBIC <- MASS::stepAIC(mod, k = log(nrow(laliga)), trace = 0)
summary(modBIC) # Better, but still unsatisfactory

# Also, huge multicollinearity
car::vif(modBIC)

# A quick way of removing columns without knowing their positions
laligaRed <- subset(laliga, select = -c(Points, Wins, Draws, Loses,
                                        Matches.without.conceding))

# PCA without Points, Wins, Draws, Loses, and Matches.without.conceding
pcaLaligaRed <- princomp(x = laligaRed, cor = TRUE, fix_sign = TRUE)
summary(pcaLaligaRed) # l = 3 gives 86% of variance explained

# Interpretation of PC1 and PC2
biplot(pcaLaligaRed)
# PC1: attack performance of the team

# Create a new dataset with the response + principal components
laligaPCA <- data.frame("Points" = laliga$Points, pcaLaligaRed$scores)

# Regression on all the principal components
modPCA <- lm(Points ~ ., data = laligaPCA)
summary(modPCA) # Predictors clearly significant -- same R^2 as mod
car::vif(modPCA) # No problems at all

# Using the first three components
modPCA3 <- lm(Points ~ Comp.1 + Comp.2 + Comp.3, data = laligaPCA)
summary(modPCA3)

# Coefficients associated with each original predictor (gamma)
alpha <- modPCA3$coefficients
gamma <- pcaLaligaRed$loadings[, 1:3] %*% alpha[-1] # Slopes
gamma <- c(alpha[1] - pcaLaligaRed$center %*% gamma, gamma) # Intercept
names(gamma) <- c("Intercept", rownames(pcaLaligaRed$loadings))
gamma

# We can overpenalize to have a simpler model -- also, one single
# principal component does quite well
modPCABIC <- MASS::stepAIC(modPCA, k = 2 * log(nrow(laliga)), trace = 0)
summary(modPCABIC)
# Note that the order of the principal components does not correspond
# exactly to their importance in the regression!
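
# Aside (added sketch, not in the original script): a quick BIC comparison of
# the models fitted so far on laliga; smaller is better
BIC(modBIC, modPCA, modPCA3, modPCABIC)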
# To perform prediction, we first need to compute the scores associated with
# the new values of the predictors, conveniently preprocessed

# Predictions for FCB and RMA (although they are part of the training sample)
newPredictors <- laligaRed[1:2, ]
newPredictors <- scale(newPredictors, center = pcaLaligaRed$center,
                       scale = pcaLaligaRed$scale) # Centered and scaled
newScores <- t(apply(newPredictors, 1,
                     function(x) t(pcaLaligaRed$loadings) %*% x))

# We need a data frame for prediction
newScores <- data.frame("Comp" = newScores)
predict(modPCABIC, newdata = newScores, interval = "prediction")

# Reality
laliga[1:2, 1]

## ----pcr-2------------------------------------------------------------------------------
# Create a dataset without the problematic predictors and with the response
laligaRed2 <- subset(laliga, select = -c(Wins, Draws, Loses,
                                         Matches.without.conceding))

# Simple call to pcr
library(pls)
modPcr <- pcr(Points ~ ., data = laligaRed2, scale = TRUE)
# Notice that we do not need to create a data frame with the PCA scores; this
# is done automatically within pcr(). We also have the flexibility to remove
# predictors from the PCA
# scale = TRUE means that the predictors are scaled internally before computing
# the PCA

# The summary of the model is different
summary(modPcr)
# First row: percentage of variance explained of the predictors
# Second row: percentage of variance explained of Y (the R^2)
# Note that we have the same R^2 for 3 and 12 components as in the previous
# approach

# Slots of information in the model -- most of them are 3-dim arrays with the
# third dimension indexing the number of components considered
names(modPcr)

# The coefficients of the original predictors (gammas), not of the components!
modPcr$coefficients[, , 12]
# pcr() computes up to ncomp (in this case, 12) linear models, each one
# considering one extra principal component. $coefficients returns the
# coefficients of all these linear models in a 3-dim array

# Prediction is simpler and can be done for different numbers of components
predict(modPcr, newdata = laligaRed2[1:2, ], ncomp = 12)
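
# Aside (added sketch, not in the original script): with all 12 components,
# PCR reproduces the least squares fit, so these predictions should match the
# ones from the full linear model mod fitted above
predict(mod, newdata = laligaRed2[1:2, ])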
# Selecting the number of components to retain. All the components up to ncomp
# are selected; no further flexibility is possible
modPcr2 <- pcr(Points ~ ., data = laligaRed2, scale = TRUE, ncomp = 3)
summary(modPcr2)

# Selecting the number of components to retain by leave-one-out
# cross-validation
modPcrCV1 <- pcr(Points ~ ., data = laligaRed2, scale = TRUE,
                 validation = "LOO")
summary(modPcrCV1)

# View the cross-validation mean squared error of prediction (MSEP)
validationplot(modPcrCV1, val.type = "MSEP") # l = 8 gives the minimum CV
# The black line is the CV loss; the dashed red line is the adjCV loss, a
# bias-corrected version of the MSEP (not described in the notes)

# Selecting the number of components to retain by 10-fold cross-validation
# (k = 10 is the default; this can be changed with the argument segments)
modPcrCV10 <- pcr(Points ~ ., data = laligaRed2, scale = TRUE,
                  validation = "CV")
summary(modPcrCV10)
validationplot(modPcrCV10, val.type = "MSEP") # l = 8 gives the minimum CV

## ----pcr-3------------------------------------------------------------------------------
# Equality of loadings from princomp() and pcr()
max(abs(abs(pcaLaligaRed$loadings[, 1:3]) - abs(modPcr$loadings[, 1:3])))

# Equality of scores from princomp() and pcr() (with the same standardization)
max(abs(abs(pcaLaligaRed$scores[, 1:3]) -
          abs(modPcr$scores[, 1:3] * sqrt(n / (n - 1)))))

# Equality of the gamma coefficients obtained previously for 3 PCs
# (with the same standardization)
modPcr$coefficients[, , 3] / sqrt(n / (n - 1))
gamma[-1]

# Coefficients associated with the principal components -- same as in modPCA3
lm(Points ~ ., data = data.frame("Points" = laliga$Points,
                                 modPcr$scores[, 1:3] * sqrt(n / (n - 1))))
modPCA3
# Of course, flipping of signs is always possible with PCA

## ----plsr-1-----------------------------------------------------------------------------
# Simple call to plsr -- very similar to pcr
modPlsr <- plsr(Points ~ ., data = laligaRed2, scale = TRUE)

# The summary of the model
summary(modPlsr)
# First row: percentage of variance explained of the predictors
# Second row: percentage of variance explained of Y (the R^2)
# Note that we have the same R^2 for 12 components as in the linear model

# Slots of information in the model
names(modPlsr)

# PLS scores
head(modPlsr$scores)

# Also uncorrelated
head(cov(modPlsr$scores))
# The coefficients of the original predictors, not of the components!
modPlsr$coefficients[, , 2]

# Obtaining the coefficients of the PLS components
lm(formula = Points ~ ., data = data.frame("Points" = laliga$Points,
                                           modPlsr$scores[, 1:3]))

# Prediction
predict(modPlsr, newdata = laligaRed2[1:2, ], ncomp = 12)

# Selecting the number of components to retain
modPlsr2 <- plsr(Points ~ ., data = laligaRed2, scale = TRUE, ncomp = 2)
summary(modPlsr2)

# Selecting the number of components to retain by leave-one-out
# cross-validation
modPlsrCV1 <- plsr(Points ~ ., data = laligaRed2, scale = TRUE,
                   validation = "LOO")
summary(modPlsrCV1)

# View the cross-validation mean squared error of prediction (MSEP)
validationplot(modPlsrCV1, val.type = "MSEP") # l = 4 gives the minimum CV

# Selecting the number of components to retain by 10-fold cross-validation
# (k = 10 is the default)
modPlsrCV10 <- plsr(Points ~ ., data = laligaRed2, scale = TRUE,
                    validation = "CV")
summary(modPlsrCV10)
validationplot(modPlsrCV10, val.type = "MSEP") # l = 4 is close to the minimum CV

# Manually regress Points on the scores, in order to have an lm object

# Create a new dataset with the response + PLS components
laligaPLS <- data.frame("Points" = laliga$Points, cbind(modPlsr$scores))

# Regression on the first two PLS components
modPlsr <- lm(Points ~ Comp.1 + Comp.2, data = laligaPLS)
summary(modPlsr) # Predictors clearly significant -- same R^2 as in modPlsr2
car::vif(modPlsr) # No problems at all
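
# Aside (added sketch, not in the original script): the cross-validation errors
# displayed by validationplot() can also be inspected numerically with MSEP()
# from the pls package, if exact values are preferred
MSEP(modPlsrCV1)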