library(mgcv)
library(marginaleffects)
library(dplyr)
library(gratia)
library(ggplot2); theme_set(theme_bw())

# Load the annual American kestrel abundance time series in 
# taken in British Columbia, Canada. These data have been collected 
# annually, corrected for changes in observer coverage and detectability,
# and logged. They can be found in the MARSS package
load(url('https://github.com/atsa-es/MARSS/raw/master/data/kestrel.rda'))
View(kestrel)

# Arrange the data into a data.frame
regions <- c("BC", "Alb", "Sask")
model_data <- do.call(rbind, lapply(seq_along(regions), function(x){
  data.frame(year = kestrel[,1],
             logcount = kestrel[,1 + x],
             region = regions[x])
}))
View(model_data)

# Factors for year and region
model_data %>%
  dplyr::mutate(yearfac = as.factor(year),
                region = as.factor(region)) -> model_data

# Inspect the data structure and dimensions
dplyr::glimpse(model_data)
levels(model_data$yearfac)
levels(model_data$region)

# Plot the time series
ggplot(model_data, aes(x = year, y = logcount)) + 
  geom_point() + 
  geom_line() +
  facet_wrap(~region)

# The adjusted logcount never goes below zero
ggplot(model_data, aes(logcount)) +
  geom_histogram(col = 'white')
summary(model_data$logcount)

# Q. What kind of observation family is appropriate?
?family
?mgcv::family.mgcv

# Maybe a Gamma?
# Random effects in mgcv use the 're' basis
?mgcv::smooth.construct.re.smooth.spec

# A model with yearly random intercepts
mod <- gam(logcount ~ s(yearfac, bs = 're'),
           data = model_data,
           family = Gamma(link = 'inverse'),
           method = 'REML')

# Oops; turns but it does contain actual zeros
any(model_data$logcount == 0L)

# A Tweedie model may be best here
mod <- gam(logcount ~ s(yearfac, bs = 're'),
           data = model_data,
           family = Tweedie(p = 1.5),
           method = 'REML')

# Random effect variance component
gam.vcomp(mod)

# Summary
summary(mod)

# Residual diagnostics (using gratia)
appraise(mod)

# Looks poor! Why? Look at the random effect estimates
plot_predictions(mod, condition = 'yearfac')

# Look at residuals against region?
resids <- residuals(mod)
ggplot(data.frame(model_data %>%
                    dplyr::bind_cols(data.frame(resids))),
       aes(x = year, y = resids)) +
  geom_point() + 
  geom_hline(yintercept = 0, linetype = 'dashed') +
  facet_wrap(~region)

# Aha! Need to add region as a factor as well
mod2 <- gam(logcount ~ s(yearfac, bs = 're') +
              s(region, bs = 're'),
            data = model_data,
            family = Tweedie(p = 1.5),
            method = 'REML')
gam.vcomp(mod2)
summary(mod2)
appraise(mod2)

# Look at residuals again
resids <- residuals(mod2)
ggplot(data.frame(model_data %>%
                    dplyr::bind_cols(data.frame(resids))),
       aes(x = year, y = resids)) +
  geom_point() + 
  geom_hline(yintercept = 0, linetype = 'dashed') +
  facet_wrap(~region)

# Better, but still obvious problems 
# (the trend is not the same per region, how can we model this?)

#### Bonus tasks ####
# 1. Calculate residual ACF and pACF functions for each region using
#    the residuals from mod2. Hint, you can add residuals to the 
#    data object as we've done above

# 2. Look at the help file on random effects in mgcv and work out
#    how to fit a model that includes random slopes of year. It'll be
#    best to scale the year variable beforehand for easier computation
model_data %>%
  dplyr::mutate(year_scaled = as.vector(scale(year))) -> model_data

# 3. Look at residuals against region for this model, and use
#    plot_predictions() to show the predicted linear year functions
#    for each region

# 4. Compare the model with random slopes against mod2 using a 
#    Generalized Likelihood Ratio test (hint, see anova.gam() for help)
?anova.gam