## ----------------------------------------------------------------------------
## Name: 02-lm-i.R
## Description: Script for Chapter 2 of "Notes for Predictive Modeling"
## Link: https://bookdown.org/egarpor/PM-UC3M/
## License: https://creativecommons.org/licenses/by-nc-nd/4.0/
## Author: Eduardo García-Portugués
## Version: 5.10.0
## ----------------------------------------------------------------------------

## ----lscheck-1-----------------------------------------------------------------
# Generates 50 points from a N(0, 1): predictor and error
set.seed(34567)
x <- rnorm(n = 50)
eps <- rnorm(n = 50)

# Responses
yLin <- -0.5 + 1.5 * x + eps
yQua <- -0.5 + 1.5 * x^2 + eps
yExp <- -0.5 + 1.5 * 2^x + eps

# Data
leastSquares <- data.frame(x = x, yLin = yLin, yQua = yQua, yExp = yExp)

## ----lscheck-2, fig.cap = '(ref:lscheck-2-title)'------------------------------
# Call lm
lm(yLin ~ x, data = leastSquares)
lm(yQua ~ x, data = leastSquares)
lm(yExp ~ x, data = leastSquares)

# The lm object
mod <- lm(yLin ~ x, data = leastSquares)
mod

# We can produce a plot with the linear fit easily
plot(x, yLin)
abline(coef = mod$coefficients, col = 2)

# Access coefficients with $coefficients
mod$coefficients

# Compute the minimized RSS
sum((yLin - mod$coefficients[1] - mod$coefficients[2] * x)^2)
sum(mod$residuals^2)

# mod is a list of objects whose names are
names(mod)

## ----lmcov---------------------------------------------------------------------
# Covariance
Sxy <- cov(x, yLin)

# Variance
Sx2 <- var(x)

# Coefficients
beta1 <- Sxy / Sx2
beta0 <- mean(yLin) - beta1 * mean(x)
c(beta0, beta1)

# Output from lm
mod <- lm(yLin ~ x, data = leastSquares)
mod$coefficients

## ----wintab-1, eval = FALSE------------------------------------------------------
## # Read data
## wine <- read.table(file = "wine.csv", header = TRUE, sep = ",")

## ----wintab-2, fig.margin = FALSE, fig.fullwidth = TRUE, out.width = .tex_web('85%', '100%'), fig.cap = '(ref:wintab-2-title)'----
# Numerical -- marginal distributions
summary(wine)

# Graphical -- pairwise relations with linear and "smooth" regressions
car::scatterplotMatrix(wine, col = 1, regLine = list(col = 2),
                       smooth = list(col.smooth = 4, col.spread = 4))

## ----wintab-3----------------------------------------------------------------------
# Set row names to Year -- useful for outlier identification
row.names(wine) <- wine$Year
wine$Year <- NULL

## ----pred--------------------------------------------------------------------------
# Price ~ AGST
modAGST <- lm(Price ~ AGST, data = wine)

# Summary of the model
summary(modAGST)

# The summary is also an object
sumModAGST <- summary(modAGST)
names(sumModAGST)

# R^2
sumModAGST$r.squared

## ----mods, echo = FALSE-------------------------------------------------------------
modFrancePop <- lm(Price ~ FrancePop, data = wine)
modAge <- lm(Price ~ Age, data = wine)
modWinterRain <- lm(Price ~ WinterRain, data = wine)
modHarvestRain <- lm(Price ~ HarvestRain, data = wine)
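
## ----mods-r2-extra-------------------------------------------------------------------
# Extra check (not in the book's script; the chunk name is ours): a compact
# comparison of the five simple regressions above by their R^2's, extracted
# from each summary as done for sumModAGST$r.squared
sapply(list(AGST = modAGST, FrancePop = modFrancePop, Age = modAge,
            WinterRain = modWinterRain, HarvestRain = modHarvestRain),
       function(mod) summary(mod)$r.squared)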
## ----lscheck3d-1, echo = FALSE------------------------------------------------------
# Generates 50 points from a N(0, 1): predictors and error
set.seed(34567) # Fixes the seed for the random generator
x1 <- rnorm(50)
x2 <- rnorm(50)
x3 <- x1 + rnorm(50, sd = 0.05) # Make variables dependent
eps <- rnorm(50)

# Responses
yLin <- -0.5 + 0.5 * x1 + 0.5 * x2 + eps
yQua <- -0.5 + x1^2 + 0.5 * x2 + eps
yExp <- -0.5 + 0.5 * exp(x2) + x3 + eps

# Data (x3 is included as well, since yExp depends on it)
leastSquares3D <- data.frame(x1 = x1, x2 = x2, x3 = x3,
                             yLin = yLin, yQua = yQua, yExp = yExp)

## ----lscheck3d-2, eval = FALSE-------------------------------------------------------
## load(file = "least-squares-3D.RData")

## ----ls3dcheck-3---------------------------------------------------------------------
# Output from lm
mod <- lm(yLin ~ x1 + x2, data = leastSquares3D)
mod$coefficients

# Matrix X
X <- cbind(1, leastSquares3D$x1, leastSquares3D$x2)

# Vector Y
Y <- leastSquares3D$yLin

# Coefficients
beta <- solve(t(X) %*% X) %*% t(X) %*% Y
# %*% multiplies matrices
# solve() computes the inverse of a matrix
# t() transposes a matrix
beta

## ----fitres, eval = FALSE--------------------------------------------------------------
## # Fitted values
## mod$fitted.values
##
## # Residuals
## mod$residuals

## ----datamarg---------------------------------------------------------------------------
set.seed(212542)
n <- 100
x1 <- rnorm(n, sd = 2)
x2 <- rnorm(n, mean = x1, sd = 3)
y <- 1 + 2 * x1 - x2 + rnorm(n, sd = 1)
data <- data.frame(x1 = x1, x2 = x2, y = y)

## ----case1-1----------------------------------------------------------------------------
# Regression on all the predictors
modWine1 <- lm(Price ~ Age + AGST + FrancePop + HarvestRain + WinterRain,
               data = wine)

# A shortcut
modWine1 <- lm(Price ~ ., data = wine)
modWine1

# Summary
summary(modWine1)

## ----case1-2----------------------------------------------------------------------------
# Fit
modWine1 <- lm(Price ~ ., data = wine)

# Summary
sumModWine1 <- summary(modWine1)
sumModWine1

# Contains the estimation of sigma ("Residual standard error")
sumModWine1$sigma

# Which is the same as
sqrt(sum(modWine1$residuals^2) / modWine1$df.residual)

## ----case1-3----------------------------------------------------------------------------
modWine2 <- lm(Price ~ . - FrancePop, data = wine)
summary(modWine2)

## ----case1-4----------------------------------------------------------------------------
car::compareCoefs(modWine1, modWine2)

## ----case1-5----------------------------------------------------------------------------
# Fit a new model
modWine3 <- lm(Price ~ Age + WinterRain, data = wine)
summary(modWine3)

# Confidence intervals at 95%
# CI: (lwr, upr)
confint(modWine3)

# Confidence intervals at other levels
confint(modWine3, level = 0.90)
confint(modWine3, level = 0.99)

# Compare with previous models
confint(modWine1)
confint(modWine2)
confint(modWine3)

## ----case1-5b---------------------------------------------------------------------------
# By default, scale centers (subtracts the mean) and scales (divides by the
# standard deviation) the columns of a matrix
wineCen <- data.frame(scale(wine, center = TRUE, scale = FALSE))

# Regression with centered response and predictors
modWine3Cen <- lm(Price ~ Age + WinterRain, data = wineCen)

# Summary
summary(modWine3Cen)
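
## ----case1-5c-extra----------------------------------------------------------------------
# Extra sanity check (not in the book's script): centering the response and
# the predictors only removes the intercept -- the slope estimates of the
# original and centered fits coincide
rbind(original = modWine3$coefficients[-1],
      centered = modWine3Cen$coefficients[-1])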
## ----case1-6, error = TRUE---------------------------------------------------------------
# Fit a linear model for the price on WinterRain, HarvestRain, and AGST
modWine4 <- lm(Price ~ WinterRain + HarvestRain + AGST, data = wine)
summary(modWine4)

# Data for which we want a prediction
# Important! You have to name the column with the predictor's name!
weather <- data.frame(WinterRain = 500, HarvestRain = 123, AGST = 18)
weatherBad <- data.frame(500, 123, 18)

# Prediction of the mean
# Prediction of the mean at 95% -- the defaults
predict(modWine4, newdata = weather)
predict(modWine4, newdata = weatherBad) # Error

# Prediction of the mean with 95% confidence interval (the default)
# CI: (lwr, upr)
predict(modWine4, newdata = weather, interval = "confidence")
predict(modWine4, newdata = weather, interval = "confidence", level = 0.95)

# Other levels
predict(modWine4, newdata = weather, interval = "confidence", level = 0.90)
predict(modWine4, newdata = weather, interval = "confidence", level = 0.99)

# Prediction of the response
# The default call returns the same point prediction as for the mean
predict(modWine4, newdata = weather)

# Prediction of the response with 95% prediction interval
# PI: (lwr, upr)
predict(modWine4, newdata = weather, interval = "prediction")
predict(modWine4, newdata = weather, interval = "prediction", level = 0.95)

# Other levels
predict(modWine4, newdata = weather, interval = "prediction", level = 0.90)
predict(modWine4, newdata = weather, interval = "prediction", level = 0.99)

# Predictions for several values
weather2 <- data.frame(WinterRain = c(500, 200), HarvestRain = c(123, 200),
                       AGST = c(17, 18))
predict(modWine4, newdata = weather2, interval = "prediction")

## ----simpleAnova-------------------------------------------------------------------------
# This function computes the simplified ANOVA table from a linear model
simpleAnova <- function(object, ...) {

  # Compute anova table
  tab <- anova(object, ...)

  # Obtain number of predictors
  p <- nrow(tab) - 1

  # Add predictors row: aggregated Df and Sum Sq, plus their Mean Sq
  predictorsRow <- colSums(tab[1:p, 1:2])
  predictorsRow <- c(predictorsRow, predictorsRow[2] / predictorsRow[1])

  # F-quantities
  Fval <- predictorsRow[3] / tab[p + 1, 3]
  pval <- pf(Fval, df1 = p, df2 = tab$Df[p + 1], lower.tail = FALSE)
  predictorsRow <- c(predictorsRow, Fval, pval)

  # Simplified table
  tab <- rbind(predictorsRow, tab[p + 1, ])
  row.names(tab)[1] <- "Predictors"
  return(tab)

}

## ----case1-7-----------------------------------------------------------------------------
# Models
modWine1 <- lm(Price ~ ., data = wine)
modWine2 <- lm(Price ~ . - FrancePop, data = wine)

# Simplified table
simpleAnova(modWine1)
simpleAnova(modWine2)
# The null hypothesis of no linear dependence is emphatically rejected in
# both models

# R's ANOVA table -- warning: this is not what we saw in lessons
anova(modWine1)

## ----R2bad-1, fig.cap = '(ref:R2bad-1-title)'----------------------------------------------
# Simple linear model

# Create data that:
# 1) does not follow a linear model
# 2) has heteroskedastic errors
x <- seq(0.15, 1, l = 100)
set.seed(123456)
eps <- rnorm(n = 100, sd = 0.25 * x^2)
y <- 1 - 2 * x * (1 + 0.25 * sin(4 * pi * x)) + eps

# Great R^2!?
reg <- lm(y ~ x)
summary(reg)

# scatterplot() is a quick alternative to
# plot(x, y)
# abline(coef = reg$coef, col = 3)

# But prediction is obviously problematic
car::scatterplot(y ~ x, col = 1, regLine = list(col = 2), smooth = FALSE)
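
## ----R2bad-1b-extra-------------------------------------------------------------------------
# Extra check (not in the book's script): with an intercept, R^2 is just the
# squared correlation between the response and the fitted values, so it only
# measures linear association, not the adequacy of the model
cor(y, reg$fitted.values)^2
summary(reg)$r.squared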
## ----R2bad-2---------------------------------------------------------------------------------
# Multiple linear model

# Create data that:
# 1) does not follow a linear model
# 2) has heteroskedastic errors
x1 <- seq(0.15, 1, l = 100)
set.seed(123456)
x2 <- runif(100, -3, 3)
eps <- rnorm(n = 100, sd = 0.25 * x1^2)
y <- 1 - 3 * x1 * (1 + 0.25 * sin(4 * pi * x1)) + 0.25 * cos(x2) + eps

# Great R^2!?
reg <- lm(y ~ x1 + x2)
summary(reg)

## ----R2bad-4, eval = knitr:::is_html_output()------------------------------------------------
# But prediction is obviously problematic
car::scatter3d(y ~ x1 + x2, fit = "linear")
rgl::rglwidget()

## ----R2bad-3, eval = FALSE--------------------------------------------------------------------
## # Generate data
## p <- 198
## n <- 200
## set.seed(3456732)
## beta <- c(0.5, -0.5, rep(0, p - 2))
## X <- matrix(rnorm(n * p), nrow = n, ncol = p)
## Y <- drop(X %*% beta + rnorm(n, sd = 3))
## data <- data.frame(y = Y, x = X)
##
## # Regression on the two meaningful predictors
## summary(lm(y ~ x.1 + x.2, data = data))
##
## # Adding 20 "garbage" variables
## # R^2 increases and adjusted R^2 decreases
## summary(lm(y ~ X[, 1:22], data = data))

## ----R2Adjneg---------------------------------------------------------------------------------
# Three independent variables
set.seed(234599)
x1 <- rnorm(100)
x2 <- rnorm(100)
y <- 1 + rnorm(100)

# Negative adjusted R^2
summary(lm(y ~ x1 + x2))

## ----R2danger-2-------------------------------------------------------------------------------
# Model with intercept
mod1 <- lm(Sepal.Length ~ Petal.Width, data = iris)
mod1

# Model without intercept
mod0 <- lm(Sepal.Length ~ 0 + Petal.Width, data = iris)
mod0

# Recall the different way of obtaining the estimators
X1 <- cbind(1, iris$Petal.Width)
X0 <- cbind(iris$Petal.Width) # No column of ones!
Y <- iris$Sepal.Length
betaHat1 <- solve(crossprod(X1)) %*% t(X1) %*% Y
betaHat0 <- solve(crossprod(X0)) %*% t(X0) %*% Y
betaHat1
betaHat0

# Summaries: higher R^2 for the model with no intercept!?
summary(mod1)
summary(mod0)

# Wait a moment... let's see the actual fit
plot(Sepal.Length ~ Petal.Width, data = iris)
abline(mod1, col = 2) # Obviously, much better
abline(mod0, col = 3)

# Compute the R^2 manually for mod1
SSE1 <- sum(mod1$residuals^2)
SST1 <- sum((mod1$model$Sepal.Length - mean(mod1$model$Sepal.Length))^2)
1 - SSE1 / SST1

# Compute the R^2 manually for mod0
SSE0 <- sum(mod0$residuals^2)
SST0 <- sum((mod0$model$Sepal.Length - mean(mod0$model$Sepal.Length))^2)
1 - SSE0 / SST0 # It is negative!

# Recall that the mean of the residuals is not zero!
mean(mod0$residuals)

# What summary really returns if there is no intercept
n <- nrow(iris)
p <- 1
R0 <- 1 - sum(mod0$residuals^2) / sum(mod0$model$Sepal.Length^2)
R0Adj <- 1 - sum(mod0$residuals^2) / sum(mod0$model$Sepal.Length^2) *
  (n - 1) / (n - p - 1)
R0
R0Adj

# What if we centered the data previously?
irisCen <- data.frame(scale(iris[, -5], center = TRUE, scale = FALSE))
modCen1 <- lm(Sepal.Length ~ Petal.Width, data = irisCen)
modCen0 <- lm(Sepal.Length ~ 0 + Petal.Width, data = irisCen)

# No problem, "correct" R^2
summary(modCen1)
summary(modCen0)

# But only if we center predictor and response...
summary(lm(iris$Sepal.Length ~ 0 + irisCen$Petal.Width))

## ----case1-8----------------------------------------------------------------------------------
# Fit models
modWine1 <- lm(Price ~ ., data = wine)
modWine2 <- lm(Price ~ . - FrancePop, data = wine)
modWine3 <- lm(Price ~ Age + WinterRain, data = wine)

# Summaries
summary(modWine1)
summary(modWine2)
summary(modWine3)
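
## ----case1-8-extra-----------------------------------------------------------------------------
# Extra (not in the book's script): a compact comparison of the three fits by
# adjusted R^2, which, unlike R^2, penalizes the number of predictors
sapply(list(modWine1 = modWine1, modWine2 = modWine2, modWine3 = modWine3),
       function(mod) summary(mod)$adj.r.squared)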