library("car")
library("ggplot2")
set.seed(123125)
data("Duncan")

# Select folds for each observation
#' Number of observations
n <- nrow(Duncan)
# Number of folds
n_folds <- 5
#' Folds for each observation
#' Using the remainder of dividing by number of folds gives equal numbers
#' Using sample shuffles the observations.
fold <- sample((seq_len(n) %% n_folds) + 1)

#' Formula for the model
form <- prestige ~ education

# Vector to hold MSE of the folds
mse_folds <- rep(NA, n_folds)

# Loop through each fold
for (i in seq_len(n_folds)) {
  # Test obervations
  test_data <- Duncan[which(fold == i), ]
  # Training observations
  training_data <- Duncan[which(fold != i), ]
  # estimate model on training data
  mod <- lm(form, data = training_data)
  # Get residuals for predictions
  err <- predict(mod, newdata = test_data) - test_data[["prestige"]]
  mse_folds[i] <- mean(err ^ 2)
}
cv_mse <- mean(mse_folds)

#' But the point of using Cross-Validation is to compare several models
#' 
#' We could copy-and paste the code above, and change the formula each time.
#' Instead, we will use a for-loop to run the model for a set of models
#' 

#' Store models in a list
formulae <- list(prestige ~ education,
                 prestige ~ income,
                 prestige ~ type,
                 prestige ~ income + education,
                 prestige ~ income + education + type,
                 prestige ~ type * (income + education),
                 prestige ~ (type + income + education) ^ 2)

#' Run the model for each formula
models <- list()
for (i in seq_along(formulae)) {
  models[[i]] <- lm(formulae[[i]], data = Duncan)
}
# Example: model 1
models[[1]]

#' Dataset
dataset <- Duncan
#' Use 10-fold cross-validation
n_folds <- 10

#' Loop over each model, and calculate cross-validated MSE
model_cv <- list()
for (i in seq_along(models)) {
  #' Extract the formula from the model
  form <- as.formula(models[[i]])
  cat("Running model: ", format(form), "\n")
  #' number of obs
  n <- nrow(dataset)
  #' folds
  fold <- sample((seq_len(n) %% n_folds) + 1)
  #' dependent variable in the model
  yvar <- as.character(form[[2]])
  # Vector to hold MSE of the folds
  mse_folds <- rep(NA, n_folds)
  # Loop through each fold
  for (k in seq_len(n_folds)) {
    # Test obervations
    data_test <- dataset[which(fold == k), , drop = FALSE]
    # Training observations
    data_train <- dataset[which(fold != k), , drop = FALSE]
    # estimate model on training data
    mod_train <- lm(form, data = data_train)
    # Predicted values for new data
    yhat <- predict(mod_train, newdata = data_test)
    # Prediction erorrs
    err <- yhat - data_test[[yvar]]
    # Mean squared error
    mse_folds[k] <- mean(err ^ 2)
  }
  # Cross-validated MSE for that model
  cv_mse <- mean(mse_folds)

  # In-sample MSE
  mse_train <- mean(residuals(models[[i]]) ^ 2)

  # Save results in a data frame
  model_cv[[i]] <- data_frame(model = i,
                         formula = format(form),
                         mse_cv = cv_mse,
                         mse_train = mse_train)
}
model_cv <- bind_rows(model_cv)

model_cv


#' Other functions:
#' 
#' - package boot: function cv.glm
#' - package DAAG: cv.lm
#' - packages: caret, mlr