library(ggplot2) #graphics library
library(patchwork) #combine graphs
library(dplyr) #manipulate database
library(viridis) #color-blind palette

theme_set(theme_minimal())
options(ggplot2.continuous.colour="viridis")
options(ggplot2.continuous.fill = "viridis")
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

library(qqplotr, warn.conflicts = FALSE)


data(insurance, package = "hecstatmod")
insurance <- insurance %>%
  mutate(obese = factor(bmi >= 30, 
                         labels = c("non-obese","obese")),
          smobese = case_when(
            obese=="obese" & smoker=="yes" ~ "smoker/obese",
            obese=="obese" & smoker=="no" ~ "non-smoker/obese",
            obese=="non-obese" ~ "non-obese"),
          smobese = factor(smobese, 
                            levels = c("smoker/obese","non-smoker/obese","non-obese")),
         )

mod <- lm(charges ~ smoker*obese*bmi + age + sex + region,
          data = insurance)
insurance_lm <- insurance %>% 
  mutate(resid = resid(mod),
         ajustee = fitted(mod),
         rstud = rstudent(mod))
g1 <- ggplot(data = insurance_lm, 
             aes(x = ajustee, y = resid)) + 
  geom_point(aes(col = smobese)) +
  geom_smooth() +
  theme(legend.position = "bottom") + 
  ylab("ordinary residuals") + 
  xlab("fitted values")
  
g2 <- ggplot(data = insurance_lm, 
             aes(x = bmi, y = resid)) + 
  geom_point(aes(col = smoker)) +
  geom_smooth() +
  theme(legend.position = "bottom") + 
  ylab("ordinary residuals") + 
  xlab("body mass index")

g3 <- ggplot(data = insurance_lm, 
             aes(x = factor(children), y = resid)) + 
  see::geom_violindot() + 
  xlab("number of children") + 
  ylab("ordinary residuals")
# seemingly increases with the number of children...

g4 <- ggplot(data = insurance_lm, 
             aes(x = ajustee, y = abs(rstud))) +
  geom_point() + 
  ylab("|externally studentized residuals|") + 
  xlab("fitted values")

(g1 + g2) / (g3 + g4)

g5 <- ggplot(data = insurance_lm, 
             aes(x = smobese, y = rstud)) +
  see::geom_violindot() + 
  ylab("externally studentized residuals") + 
  xlab("indicator smoker / obese")

di <- "t"
dp <- list(df = mod$df.residual)
g6 <- ggplot(data = insurance_lm, mapping = aes(sample = rstud)) +
  # stat_qq_band(distribution = di, dparams = dp, detrend = TRUE, bandType = "boot", B = 9999) +
  stat_qq_line(distribution = di, dparams = dp, detrend = TRUE) +
  stat_qq_point(distribution = di, dparams = dp, detrend = TRUE) +
  labs(x = "quantiles théoriques", 
       y = "quantiles empiriques")
# Here, since df>1000, we could use the normal approximation
# The confidence band (omitted) is weird because it is based on the sample average
# whereas the line of best fit passes through quartiles, which are robust estimators of
# location. Computing the sample mean/variance is a much WORSE approximation


car::qqPlot(rstudent(mod), 
            ylab = "externally studentized residuals",
              distribution = "t", df = mod$df.residual)

g7 <- ggplot(data = insurance_lm, mapping = aes(x = rstud)) +
  geom_density() + 
  geom_histogram(aes(y = ..density..), 
                 bins = 30, 
                 alpha = 0.5)


# Correlogram
library(forecast)
data(airpassengers, package = 'hecstatmod')
mod_ts <- lm(data = airpassengers,
   log(passengers) ~ month + year)
resid(mod_ts) %>% 
  Acf(plot = FALSE) %>% 
  autoplot + 
  labs(title = "ordinary residuals", 
         x = "lag", 
         y = "autocorrelation")