## ----------------------------------------------------------------------------
## Name: 04-lm-iii.R
## Description: Script for Chapter 4 of "Notes for Predictive Modeling"
## Link: https://bookdown.org/egarpor/PM-UC3M/
## License: https://creativecommons.org/licenses/by-nc-nd/4.0/
## Author: Eduardo García-Portugués
## Version: 5.10.0
## ----------------------------------------------------------------------------

## ----norms-3, echo = FALSE, eval = knitr:::is_html_output()-----------------------------
# Graphing function
M <- plot3D::mesh(x = seq(0, 1, l = 20), y = seq(-pi, pi, l = 4 * 4 + 1))
x <- M$x
y <- M$y
graph <- function(r, f = 1, col, ...) {

  plot3Drgl::surf3Drgl(f * sign(x) * (x^2)^(1/r),
                       f * (1 - x^2)^(1/r) * sign(cos(y)) * (cos(y)^2)^(1/r),
                       f * (1 - x^2)^(1/r) * sign(sin(y)) * (sin(y)^2)^(1/r),
                       smooth = FALSE, lighting = FALSE, col = col,
                       alpha = 0.35, border = "black", ...)

}

# Plot
graph(r = 3, f = 1, col = "green", xlim = c(-1.25, 1.25),
      ylim = c(-1.25, 1.25), zlim = c(-1.25, 1.25))
graph(r = 2, f = 1, col = "blue", add = TRUE)
graph(r = 1, f = 1, col = "red", add = TRUE)
rgl::par3d(userMatrix = matrix(c(0.30811620, -0.95120299, -0.01665032, 0,
                                 0.09989159, 0.01494217, 0.99488610, 0,
                                 -0.94608986, -0.30820376, 0.09962107, 0,
                                 0, 0, 0, 1),
                               nrow = 4, ncol = 4, byrow = TRUE))
rgl::rglwidget()

## ----shrinkage--------------------------------------------------------------------------
# Load data -- baseball players statistics
data(Hitters, package = "ISLR")

# Discard NA's
Hitters <- na.omit(Hitters)

# The glmnet function works with the design matrix of predictors (without
# the column of ones). This can be obtained easily through model.matrix()
x <- model.matrix(Salary ~ 0 + ., data = Hitters)
# 0 + is used to exclude the column of 1's for the intercept: the intercept is
# added by default in glmnet::glmnet and, if we did not exclude it here, we
# would end up with two intercepts, one of them resulting in NA. Fortunately,
# in newer versions of glmnet this step is not necessary

# Interestingly, note that in Hitters there are two-level factors and these
# are automatically transformed into dummy variables in x -- the main
# advantage of model.matrix
head(Hitters[, 14:20])
head(x[, 14:19])

# We also need the vector of responses
y <- Hitters$Salary

## ----modelmatrix------------------------------------------------------------------------
# Data with NA in the first observation
data_na <- data.frame("x1" = rnorm(3), "x2" = rnorm(3), "y" = rnorm(3))
data_na$x1[1] <- NA

# The first observation disappears!
model.matrix(y ~ 0 + ., data = data_na)

# Still ignores NA's
model.matrix(y ~ 0 + ., data = data_na, na.action = "na.pass")

# Does not ignore NA's
model.matrix.lm(y ~ 0 + ., data = data_na, na.action = "na.pass")

## ----ridge-1----------------------------------------------------------------------------
# Call to the main function -- use alpha = 0 for ridge regression
library(glmnet)
ridgeMod <- glmnet(x = x, y = y, alpha = 0)
# By default, it computes the ridge solution over a set of automatically
# chosen lambdas. It also standardizes the variables by default before fitting
# the model, since the penalization is scale-sensitive. Importantly, the
# coefficients are returned on the original scale of the predictors
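
# A quick illustration (sketch) of why scale matters: with standardize = FALSE,
# rescaling a predictor changes the whole fit, while with the default
# standardize = TRUE the fit is essentially unchanged and the returned
# coefficient simply adapts to the new scale. xScaled and the lambda value
# below are arbitrary choices for the illustration
xScaled <- x
xScaled[, "AtBat"] <- 100 * xScaled[, "AtBat"]
coef(glmnet(x = x, y = y, alpha = 0, lambda = 50,
            standardize = FALSE))["AtBat", ]
100 * coef(glmnet(x = xScaled, y = y, alpha = 0, lambda = 50,
                  standardize = FALSE))["AtBat", ] # Generally a different fit
coef(glmnet(x = x, y = y, alpha = 0, lambda = 50))["AtBat", ]
100 * coef(glmnet(x = xScaled, y = y, alpha = 0,
                  lambda = 50))["AtBat", ] # Same fit, on the original scale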

# Plot of the solution path -- gives the values of the coefficients against
# different measures, selected with xvar (amount of penalization imposed on
# the model, or fit)
plot(ridgeMod, xvar = "norm", label = TRUE)
# xvar = "norm" is the default: L1 norm of the coefficients sum_j abs(beta_j)

# Versus lambda
plot(ridgeMod, label = TRUE, xvar = "lambda")

# Versus the percentage of deviance explained -- this is a generalization of
# the R^2 for generalized linear models. Since we have a linear model, this is
# the same as the R^2
plot(ridgeMod, label = TRUE, xvar = "dev")

# The maximum R^2 is slightly above 0.5
# Indeed, we can see that R^2 = 0.5461
summary(lm(Salary ~ ., data = Hitters))$r.squared

# Some persistently important predictors are 16, 14, and 15
colnames(x)[c(16, 14, 15)]

# What is inside glmnet's output?
names(ridgeMod)

# lambda versus R^2 -- fit decreases as more penalization is introduced, in
# exchange for better variable interpretation and avoidance of overfitting
plot(log(ridgeMod$lambda), ridgeMod$dev.ratio, type = "l",
     xlab = "log(lambda)", ylab = "R2")
ridgeMod$dev.ratio[length(ridgeMod$dev.ratio)]
# Slightly different from lm's R^2, since glmnet trades accuracy for speed

# The coefficients for different values of lambda are given in $a0 (intercepts)
# and $beta (slopes) or, alternatively, both in coef(ridgeMod)
length(ridgeMod$a0)
dim(ridgeMod$beta)
length(ridgeMod$lambda) # 100 lambdas were automatically chosen

# Inspecting the coefficients associated with the 50th lambda
coef(ridgeMod)[, 50]
ridgeMod$lambda[50]

# Zoom into the solution path
plot(ridgeMod, label = TRUE, xvar = "lambda",
     xlim = log(ridgeMod$lambda[50]) + c(-2, 2), ylim = c(-30, 10))
abline(v = log(ridgeMod$lambda[50]))
points(rep(log(ridgeMod$lambda[50]), nrow(ridgeMod$beta)), ridgeMod$beta[, 50],
       pch = 16, col = 1:6)

# The l2-norm of the coefficients decreases as lambda increases
plot(log(ridgeMod$lambda), sqrt(colSums(ridgeMod$beta^2)), type = "l",
     xlab = "log(lambda)", ylab = "l2 norm")

## ----ridge-2----------------------------------------------------------------------------
# If we want, we can manually choose the grid of penalty parameters to explore
# The grid should be in descending order
ridgeMod2 <- glmnet(x = x, y = y, alpha = 0, lambda = 100:1)
plot(ridgeMod2, label = TRUE, xvar = "lambda") # Not a good choice!

# lambda is a tuning parameter that can be chosen by cross-validation, using
# the MSE as error criterion (other error measures can be considered for
# generalized linear models through the argument type.measure)

# 10-fold cross-validation. Change the seed for a different result
set.seed(12345)
kcvRidge <- cv.glmnet(x = x, y = y, alpha = 0, nfolds = 10)

# The lambda that minimizes the CV error is
kcvRidge$lambda.min

# Equivalent to
indMin <- which.min(kcvRidge$cvm)
kcvRidge$lambda[indMin]

# The minimum CV error
kcvRidge$cvm[indMin]
min(kcvRidge$cvm)

# Potential problem! The minimum occurs at one extreme of the lambda grid in
# which CV is done. The grid was automatically selected, but it can be
# manually specified
range(kcvRidge$lambda)
lambdaGrid <- 10^seq(log10(kcvRidge$lambda[1]), log10(0.1),
                     length.out = 150) # log-spaced grid
kcvRidge2 <- cv.glmnet(x = x, y = y, nfolds = 10, alpha = 0,
                       lambda = lambdaGrid)

# Much better
plot(kcvRidge2)
kcvRidge2$lambda.min

# But the CV curve is random, since it depends on the sample. Its variability
# can be estimated by considering the CV curves of each fold. An alternative
# approach to select lambda is to choose the largest lambda whose CV error is
# within one standard error of the minimum, in order to favor the simplicity
# of the model around the optimal lambda value. This is known as the
# "one standard error rule"
kcvRidge2$lambda.1se
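
# A quick check (sketch) of the one standard error rule: lambda.1se should
# equal the largest lambda whose CV error does not exceed the minimum CV error
# plus its standard error (indMin1se is just an auxiliary index)
indMin1se <- which.min(kcvRidge2$cvm)
max(kcvRidge2$lambda[kcvRidge2$cvm <=
                       kcvRidge2$cvm[indMin1se] + kcvRidge2$cvsd[indMin1se]])
kcvRidge2$lambda.1se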

# Location of both optimal lambdas in the CV loss function (dashed vertical
# lines), together with the lowest CV error and the lowest CV error plus one
# standard error (horizontal lines)
plot(kcvRidge2)
indMin2 <- which.min(kcvRidge2$cvm)
abline(h = kcvRidge2$cvm[indMin2] + c(0, kcvRidge2$cvsd[indMin2]))

# Using the one standard error rule for selecting lambda makes special sense
# when the CV function is quite flat around the minimum (hence an
# overpenalization that gives more sparsity does not affect the CV loss much)

# Leave-one-out cross-validation. More computationally intensive, but
# completely objective in the choice of the fold assignment
ncvRidge <- cv.glmnet(x = x, y = y, alpha = 0, nfolds = nrow(Hitters),
                      lambda = lambdaGrid)

# Location of both optimal lambdas in the CV loss function
plot(ncvRidge)

## ----ridge-3----------------------------------------------------------------------------
# Inspect the best models (the glmnet fit is inside the output of cv.glmnet)
plot(kcvRidge2$glmnet.fit, label = TRUE, xvar = "lambda")
abline(v = log(c(kcvRidge2$lambda.min, kcvRidge2$lambda.1se)))

# The model associated with lambda.1se (or any other lambda not included in
# the original path solution -- obtained by interpolation) can be retrieved
# with
predict(kcvRidge2, type = "coefficients", s = kcvRidge2$lambda.1se)

# Predictions for the first two observations
predict(kcvRidge2, type = "response", s = kcvRidge2$lambda.1se,
        newx = x[1:2, ])

# Predictions for the first observation, for all the lambdas. We can see how
# the prediction for one observation changes according to lambda
plot(log(kcvRidge2$lambda),
     predict(kcvRidge2, type = "response", newx = x[1, , drop = FALSE],
             s = kcvRidge2$lambda),
     type = "l", xlab = "log(lambda)", ylab = "Prediction")

## ----ridge-4----------------------------------------------------------------------------
# Random data
p <- 5
n <- 200
beta <- seq(-1, 1, l = p)
set.seed(123124)
x <- matrix(rnorm(n * p), n, p)
y <- 1 + x %*% beta + rnorm(n)

# Unrestricted fit
fit <- glmnet(x, y, alpha = 0, lambda = 0, intercept = TRUE,
              standardize = FALSE)
beta0Hat <- rbind(fit$a0, fit$beta)
beta0Hat

# The unrestricted fit matches least squares -- but recall that glmnet uses an
# iterative method, so it is inexact (convergence threshold thresh = 1e-7 by
# default)
X <- model.matrix(y ~ x) # A quick way of constructing the design matrix,
                         # including the column of ones for the intercept
solve(crossprod(X)) %*% t(X) %*% y

# Restricted fit
# glmnet considers as the regularization parameter "lambda" the value
# lambda / n (lambda being here the penalty parameter employed in the theory)
lambda <- 2
fit <- glmnet(x, y, alpha = 0, lambda = lambda / n, intercept = TRUE,
              standardize = FALSE)
betaLambdaHat <- rbind(fit$a0, fit$beta)
betaLambdaHat

# Analytical form with intercept
solve(crossprod(X) + diag(c(0, rep(lambda, p)))) %*% t(X) %*% y
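
# Sketch: tightening glmnet's convergence threshold (thresh; the value 1e-12
# below is an arbitrary choice) should bring the numerical solution closer to
# the analytical one above
fit2 <- glmnet(x, y, alpha = 0, lambda = lambda / n, intercept = TRUE,
               standardize = FALSE, thresh = 1e-12)
rbind(fit2$a0, fit2$beta)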

## ----lasso-1----------------------------------------------------------------------------
# Get the Hitters data back
x <- model.matrix(Salary ~ 0 + ., data = Hitters)
y <- Hitters$Salary

# Call to the main function -- use alpha = 1 for lasso regression (the default)
lassoMod <- glmnet(x = x, y = y, alpha = 1)
# Same defaults as before, same object structure

# Plot of the solution path -- now the paths are not smooth when decreasing to
# zero (they are exactly zero). This is a consequence of the l1 norm
plot(lassoMod, xvar = "lambda", label = TRUE)
# Some persistently important predictors are 16 and 14

# Versus the R^2 -- same maximum R^2 as before
plot(lassoMod, label = TRUE, xvar = "dev")

# Now the l1-norm of the coefficients decreases as lambda increases
plot(log(lassoMod$lambda), colSums(abs(lassoMod$beta)), type = "l",
     xlab = "log(lambda)", ylab = "l1 norm")

# 10-fold cross-validation. Change the seed for a different result
set.seed(12345)
kcvLasso <- cv.glmnet(x = x, y = y, alpha = 1, nfolds = 10)

# The lambda that minimizes the CV error
kcvLasso$lambda.min

# The "one standard error rule" for lambda
kcvLasso$lambda.1se

# Location of both optimal lambdas in the CV loss function
indMin <- which.min(kcvLasso$cvm)
plot(kcvLasso)
abline(h = kcvLasso$cvm[indMin] + c(0, kcvLasso$cvsd[indMin]))
# No problems now: the minimum does not occur at one extreme
# Interesting: note that the numbers on top of the figure give the number of
# coefficients *exactly* different from zero -- the number of predictors
# effectively considered in the model!
# In this case, the one standard error rule also makes sense

# Leave-one-out cross-validation
lambdaGrid <- 10^seq(log10(kcvLasso$lambda[1]), log10(0.1),
                     length.out = 150) # log-spaced grid
ncvLasso <- cv.glmnet(x = x, y = y, alpha = 1, nfolds = nrow(Hitters),
                      lambda = lambdaGrid)

# Location of both optimal lambdas in the CV loss function
plot(ncvLasso)

## ----lasso-2----------------------------------------------------------------------------
# Inspect the best models
plot(kcvLasso$glmnet.fit, label = TRUE, xvar = "lambda")
abline(v = log(c(kcvLasso$lambda.min, kcvLasso$lambda.1se)))

# The model associated with lambda.min (or any other lambda not included in
# the original path solution -- obtained by interpolation) can be retrieved
# with
predict(kcvLasso, type = "coefficients",
        s = c(kcvLasso$lambda.min, kcvLasso$lambda.1se))

# Predictions for the first two observations
predict(kcvLasso, type = "response",
        s = c(kcvLasso$lambda.min, kcvLasso$lambda.1se), newx = x[1:2, ])

## ----lasso-3----------------------------------------------------------------------------
# We can use lasso for model selection!
selPreds <- predict(kcvLasso, type = "coefficients",
                    s = c(kcvLasso$lambda.min, kcvLasso$lambda.1se))[-1, ] != 0
x1 <- x[, selPreds[, 1]]
x2 <- x[, selPreds[, 2]]

# Least squares fit with variables selected by lasso
modLassoSel1 <- lm(y ~ x1)
modLassoSel2 <- lm(y ~ x2)
summary(modLassoSel1)
summary(modLassoSel2)

# Comparison with stepwise selection
modBIC <- MASS::stepAIC(lm(Salary ~ ., data = Hitters), k = log(nrow(Hitters)),
                        trace = 0)
summary(modBIC)

# The lasso variable selection is similar, although the model is slightly
# worse in terms of adjusted R^2 and significance of the predictors. However,
# keep in mind that lasso is solving a constrained least squares problem, so a
# selection procedure that employs solutions of unconstrained least squares is
# expected to achieve better R^2 and adjusted R^2. What is remarkable is the
# speed of lasso in selecting variables, and the fact that it gives quite good
# starting points for performing further model selection
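
# Side-by-side look (sketch) at which predictors each approach retains. The
# names may not match exactly, since model.matrix(Salary ~ 0 + ., ...) codes
# the first factor (League) with both of its levels, unlike lm's default
# contrasts
names(which(selPreds[, 2])) # Lasso with lambda.1se
names(coef(modBIC))[-1] # Stepwise BIC selection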

# Another interesting possibility is to run a stepwise selection starting from
# the set of predictors selected by lasso. In this search, it is important to
# use direction = "both" (the default) and to define the scope argument
# adequately
f <- formula(paste("Salary ~", paste(names(which(selPreds[, 2])),
                                     collapse = " + ")))
start <- lm(f, data = Hitters) # Model with predictors selected by lasso
scope <- list(lower = lm(Salary ~ 1, data = Hitters), # No predictors
              upper = lm(Salary ~ ., data = Hitters)) # All the predictors
modBICFromLasso <- MASS::stepAIC(object = start, k = log(nrow(Hitters)),
                                 scope = scope, trace = 0)
summary(modBICFromLasso)

# Comparison in terms of BIC: slight improvement with modBICFromLasso
BIC(modLassoSel1, modLassoSel2, modBICFromLasso, modBIC)

## ----lasso-4, fig.cap = '(ref:lasso-4-title)'-------------------------------------------
# Random data with predictors unrelated to the response
p <- 100
n <- 300
set.seed(123124)
x <- matrix(rnorm(n * p), n, p)
y <- 1 + rnorm(n)

# CV
lambdaGrid <- exp(seq(-10, 3, l = 200))
plot(cv.glmnet(x = x, y = y, alpha = 1, nfolds = n, lambda = lambdaGrid))

## ----constr-1---------------------------------------------------------------------------
# Simulate data
set.seed(123456)
n <- 50
p <- 3
x1 <- rnorm(n, mean = 1)
x2 <- rnorm(n, mean = 2)
x3 <- rnorm(n, mean = 3)
eps <- rnorm(n, sd = 0.5)
y <- 1 + 2 * x1 - 3 * x2 + x3 + eps

# Center the data and compute the design matrix
x1Cen <- x1 - mean(x1)
x2Cen <- x2 - mean(x2)
x3Cen <- x3 - mean(x3)
yCen <- y - mean(y)
X <- cbind(x1Cen, x2Cen, x3Cen)

# Linear restrictions: use that
# beta_1 + beta_2 + beta_3 = 0
# beta_2 = -3
# In this case q = 2. The restrictions are encoded as
A <- rbind(c(1, 1, 1),
           c(0, 1, 0))
c <- c(0, -3)

# Fit the model without an intercept
S <- solve(crossprod(X))
beta_hat <- S %*% t(X) %*% yCen
beta_hat

# Restricted fit enforcing A * beta = c
beta_hat_A <- beta_hat +
  S %*% t(A) %*% solve(A %*% S %*% t(A)) %*% (c - A %*% beta_hat)
beta_hat_A

# Intercept of the constrained fit
beta_hat_A_0 <- mean(y) - c(mean(x1), mean(x2), mean(x3)) %*% beta_hat_A
beta_hat_A_0
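
# Quick check (sketch): the constrained estimate satisfies the restrictions
# exactly, A %*% beta_hat_A = c = (0, -3)', while the unconstrained estimate
# does not
A %*% beta_hat_A
A %*% beta_hat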

## ----multr-1----------------------------------------------------------------------------
# Dimensions and sample size
p <- 3
q <- 2
n <- 100

# A quick way of creating a non-diagonal (and valid) covariance matrix for
# the errors
Sigma <- 3 * toeplitz(seq(1, 0.1, l = q))
set.seed(12345)
X <- mvtnorm::rmvnorm(n = n, mean = 1:p, sigma = diag(0.5, nrow = p, ncol = p))
E <- mvtnorm::rmvnorm(n = n, mean = rep(0, q), sigma = Sigma)

# Linear model
B <- matrix((-1)^(1:p) * (1:p), nrow = p, ncol = q, byrow = TRUE)
Y <- X %*% B + E

# Fitting the model (note: Y and X are matrices!)
mod <- lm(Y ~ X)
mod
# Note that the intercept is markedly different from zero -- that is because
# X is not centered

# Compare with B
B

# Summary of the model: gives q separate summaries, one for each fitted
# univariate model
summary(mod)

# Exactly equivalent to
summary(lm(Y[, 1] ~ X))
summary(lm(Y[, 2] ~ X))

## ----multr-2----------------------------------------------------------------------------
# When we want to include several variables of a dataset as responses through
# the formula interface, we have to use cbind() in the response. Doing
# "Petal.Width + Petal.Length ~ ..." is INCORRECT, as lm will understand
# "I(Petal.Width + Petal.Length) ~ ..." and run one single regression

# Predict Petal's measurements from Sepal's
modIris <- lm(cbind(Petal.Width, Petal.Length) ~ Sepal.Length + Sepal.Width +
                Species, data = iris)
summary(modIris)

# The fitted values and residuals are now matrices
head(modIris$fitted.values)
head(modIris$residuals)

# The individual models
modIris1 <- lm(Petal.Width ~ Sepal.Length + Sepal.Width + Species, data = iris)
modIris2 <- lm(Petal.Length ~ Sepal.Length + Sepal.Width + Species,
               data = iris)
summary(modIris1)
summary(modIris2)

## ----multr-3----------------------------------------------------------------------------
# Confidence intervals for the parameters
confint(modIris)
# Warning! Do not confuse Petal.Width:Sepal.Length with an interaction term!
# It is meant to represent the Response:Predictor coefficient

# Prediction -- now more limited, without confidence intervals implemented
predict(modIris, newdata = iris[1:3, ])

# MANOVA table
manova(modIris)
# "Same" as the "Sum Sq" and "Df" entries of
anova(modIris1)
anova(modIris2)

# anova() serves for assessing the significance of including a new predictor
# for explaining all the responses. This is based on an extension of the
# *sequential* ANOVA table briefly covered in Section 2.6. The hypothesis test
# is by default conducted with the Pillai statistic (an extension of the
# F-test)
anova(modIris)

## ----multr-4----------------------------------------------------------------------------
# Simulate data
n <- 500
p <- 50
q <- 10
set.seed(123456)
X <- mvtnorm::rmvnorm(n = n, mean = p:1, sigma = 5 * 0.5^toeplitz(1:p))
E <- mvtnorm::rmvnorm(n = n, mean = rep(0, q), sigma = toeplitz(q:1))
B <- 5 * (2 / (0.5 * (1:p - 15)^2 + 2) + 1 / (0.1 * (1:p - 40)^2 + 1)) %*%
  t(1 / sqrt(1:q))
Y <- X %*% B + E

# Visualize B -- dark violet is close to 0
image(1:q, 1:p, t(B), col = viridisLite::viridis(20), xlab = "q", ylab = "p")

# Lasso path fit
mfit <- glmnet(x = X, y = Y, family = "mgaussian", alpha = 1)

# A list of models, one for each response
str(mfit$beta, 1)

# Tuning parameter selection by 10-fold cross-validation
set.seed(12345)
kcvLassoM <- cv.glmnet(x = X, y = Y, family = "mgaussian", alpha = 1)
kcvLassoM$lambda.min
kcvLassoM$lambda.1se

# Location of both optimal lambdas in the CV loss function
indMin <- which.min(kcvLassoM$cvm)
plot(kcvLassoM)
abline(h = kcvLassoM$cvm[indMin] + c(0, kcvLassoM$cvsd[indMin]))

# Extract the coefficients associated with some fits
coefs <- predict(kcvLassoM, type = "coefficients",
                 s = c(kcvLassoM$lambda.min, kcvLassoM$lambda.1se))
str(coefs, 1)

# Predictions for the first two observations
preds <- predict(kcvLassoM, type = "response",
                 s = c(kcvLassoM$lambda.min, kcvLassoM$lambda.1se),
                 newx = X[1:2, ])
preds
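
# Sketch: with family = "mgaussian", glmnet penalizes each predictor's whole
# vector of coefficients across responses jointly (a group-lasso-type
# penalty), so a predictor should be selected either for all responses or for
# none. The counts of nonzero coefficients (one row per value of s, one column
# per response) should therefore coincide across responses
sapply(coefs, function(b) colSums(as.matrix(b)[-1, ] != 0))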

## ----multr-5, eval = FALSE---------------------------------------------------------------
## manipulate::manipulate({
##
##   # Color
##   col <- viridisLite::viridis(20)
##
##   # Common zlim
##   zlim <- range(B) + c(-0.25, 0.25)
##
##   # Plot true B
##   par(mfrow = c(1, 2))
##   image(1:q, 1:p, t(B), col = col, xlab = "q", ylab = "p", zlim = zlim,
##         main = "B")
##
##   # Extract B_hat from the lasso fit, a p x q matrix
##   B_hat <- sapply(seq_along(mfit$beta), function(i) mfit$beta[i][[1]][, j])
##
##   # Put as black rows the predictors included
##   not_zero <- abs(B_hat) > 0
##   image(1:q, 1:p, t(not_zero), breaks = c(0.5, 1),
##         col = rgb(1, 1, 1, alpha = 0.1), add = TRUE)
##
##   # For B_hat
##   image(1:q, 1:p, t(B_hat), col = col, xlab = "q", ylab = "p", zlim = zlim,
##         main = "Bhat")
##   image(1:q, 1:p, t(not_zero), breaks = c(0.5, 1),
##         col = rgb(1, 1, 1, alpha = 0.1), add = TRUE)
##
## }, j = manipulate::slider(min = 1, max = ncol(mfit$beta$y1), step = 1,
##                           label = "j in lambda(j)"))

## ----biglm-1-----------------------------------------------------------------------------
# Not really "big data", but for the sake of illustration
set.seed(12345)
n <- 1e6
p <- 10
beta <- seq(-1, 1, length.out = p)^5
x1 <- matrix(rnorm(n * p), nrow = n, ncol = p)
x1[, p] <- 2 * x1[, 1] + rnorm(n, sd = 0.1) # Add some dependence to predictors
x1[, p - 1] <- 2 - x1[, 2] + rnorm(n, sd = 0.5)
y1 <- 1 + x1 %*% beta + rnorm(n)
x2 <- matrix(rnorm(100 * p), nrow = 100, ncol = p)
y2 <- 1 + x2 %*% beta + rnorm(100)
bigData1 <- data.frame("resp" = y1, "pred" = x1)
bigData2 <- data.frame("resp" = y2, "pred" = x2)

# biglm has a very similar syntax to lm -- but the formula interface does not
# always work as expected
# biglm::biglm(formula = resp ~ ., data = bigData1) # Does not work
# biglm::biglm(formula = y ~ x) # Does not work
# biglm::biglm(formula = resp ~ pred.1 + pred.2, data = bigData1) # Does work,
# but it is not very convenient for a large number of predictors

# Hack for the automatic inclusion of all the predictors
f <- formula(paste("resp ~", paste(names(bigData1)[-1], collapse = " + ")))
biglmMod <- biglm::biglm(formula = f, data = bigData1)

# lm's call
lmMod <- lm(formula = resp ~ ., data = bigData1)

# The reduction in size of the resulting object is more than notable
print(object.size(biglmMod), units = "KB")
print(object.size(lmMod), units = "MB")

# Summaries
s1 <- summary(biglmMod)
s2 <- summary(lmMod)
s1
s2

# Further information
s1$mat # Coefficients and their inferences
s1$rsq # R^2
s1$nullrss # SST (as in Section 2.6)

# Extract coefficients
coef(biglmMod)

# Prediction works as usual, but newdata must contain a column for the response
predict(biglmMod, newdata = bigData2[1:5, ])
# predict(biglmMod, newdata = bigData2[1:5, -1]) # Error

# Update the model with new data
update(biglmMod, moredata = bigData2)

# AIC and BIC
AIC(biglmMod, k = 2)
AIC(biglmMod, k = log(n))

# Features not immediately available for biglm objects: stepwise selection by
# stepAIC, residuals, variance of the error, model diagnostics, and VIFs

# Workaround for obtaining hat(sigma)^2 = SSE / (n - p - 1), with
# SSE = SST * (1 - R^2)
(s1$nullrss * (1 - s1$rsq)) / s1$obj$df.resid
s2$sigma^2

## ----biglm-2, fig.cap = '(ref:biglm-2-title)', fig.margin = FALSE------------------------
# Model selection adapted to big data models
reg <- leaps::regsubsets(biglmMod, nvmax = p, method = "exhaustive")
plot(reg) # Plots best model (top row) to worst model (bottom row)

# Summarize (otherwise regsubsets's output is hard to decipher)
subs <- summary(reg)
subs

# Lots of useful information
str(subs, 1)

# Get the model with the lowest BIC
subs$which
subs$bic
subs$which[which.min(subs$bic), ]

# Show the display in Figure 4.6
subs$which[order(subs$bic), ]

# It also works with ordinary linear models, and it is much faster and more
# informative than stepAIC
reg <- leaps::regsubsets(resp ~ ., data = bigData1, nvmax = p,
                         method = "backward")
subs <- summary(reg)
subs$bic
subs$which[which.min(subs$bic), ]

# Compare it with stepAIC
MASS::stepAIC(lm(resp ~ ., data = bigData1), trace = 0,
              direction = "backward", k = log(n))

## ----biglm-3-----------------------------------------------------------------------------
# Size of the response
print(object.size(rnorm(1e6)) * 1e2, units = "GB")

# Size of the predictors
print(object.size(rnorm(1e6)) * 1e2 * 10, units = "GB")
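
# The same back-of-the-envelope calculation by hand (sketch): each numeric
# value is stored as an 8-byte double, so n * (p + 1) values need about
# n * (p + 1) * 8 bytes
1e8 * (10 + 1) * 8 / 1024^3 # In binary GB, roughly the sum of the sizes above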

## ----biglm-4-----------------------------------------------------------------------------
# Linear regression with n = 10^8 and p = 10
n <- 10^8
p <- 10
beta <- seq(-1, 1, length.out = p)^5

# Number of chunks for splitting the dataset
nChunks <- 1e3
nSmall <- n / nChunks

# Simulates reading the first chunk of data
set.seed(12345)
x <- matrix(rnorm(nSmall * p), nrow = nSmall, ncol = p)
x[, p] <- 2 * x[, 1] + rnorm(nSmall, sd = 0.1)
x[, p - 1] <- 2 - x[, 2] + rnorm(nSmall, sd = 0.5)
y <- 1 + x %*% beta + rnorm(nSmall)

# First fit
bigMod <- biglm::biglm(y ~ x, data = data.frame(y, x))

# Update fit
# pb <- txtProgressBar(style = 3)
for (i in 2:nChunks) {

  # Simulates reading the i-th chunk of data
  set.seed(12345 + i)
  x <- matrix(rnorm(nSmall * p), nrow = nSmall, ncol = p)
  x[, p] <- 2 * x[, 1] + rnorm(nSmall, sd = 0.1)
  x[, p - 1] <- 2 - x[, 2] + rnorm(nSmall, sd = 0.5)
  y <- 1 + x %*% beta + rnorm(nSmall)

  # Update the fit
  bigMod <- update(bigMod, moredata = data.frame(y, x))

  # Progress
  # setTxtProgressBar(pb = pb, value = i / nChunks)

}

# Final model
summary(bigMod)
print(object.size(bigMod), units = "KB")
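
# Sanity check (sketch): with such a large sample size, the estimated
# coefficients should essentially recover the true ones used in the simulation
cbind("True" = c(1, beta), "Estimated" = coef(bigMod))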