library(mgcv) library(marginaleffects) library(dplyr) library(gratia) library(ggplot2); theme_set(theme_bw()) # Load the annual American kestrel abundance time series in # taken in British Columbia, Canada. These data have been collected # annually, corrected for changes in observer coverage and detectability, # and logged. They can be found in the MARSS package load(url('https://github.com/atsa-es/MARSS/raw/master/data/kestrel.rda')) View(kestrel) # Arrange the data into a data.frame regions <- c("BC", "Alb", "Sask") model_data <- do.call(rbind, lapply(seq_along(regions), function(x){ data.frame(year = kestrel[,1], logcount = kestrel[,1 + x], region = regions[x]) })) View(model_data) # Factors for year and region model_data %>% dplyr::mutate(yearfac = as.factor(year), region = as.factor(region)) -> model_data # Inspect the data structure and dimensions dplyr::glimpse(model_data) levels(model_data$yearfac) levels(model_data$region) # Plot the time series ggplot(model_data, aes(x = year, y = logcount)) + geom_point() + geom_line() + facet_wrap(~region) # The adjusted logcount never goes below zero ggplot(model_data, aes(logcount)) + geom_histogram(col = 'white') summary(model_data$logcount) # Q. What kind of observation family is appropriate? ?family ?mgcv::family.mgcv # Maybe a Gamma? # Random effects in mgcv use the 're' basis ?mgcv::smooth.construct.re.smooth.spec # A model with yearly random intercepts mod <- gam(logcount ~ s(yearfac, bs = 're'), data = model_data, family = Gamma(link = 'inverse'), method = 'REML') # Oops; turns but it does contain actual zeros any(model_data$logcount == 0L) # A Tweedie model may be best here mod <- gam(logcount ~ s(yearfac, bs = 're'), data = model_data, family = Tweedie(p = 1.5), method = 'REML') # Random effect variance component gam.vcomp(mod) # Summary summary(mod) # Residual diagnostics (using gratia) appraise(mod) # Looks poor! Why? Look at the random effect estimates plot_predictions(mod, condition = 'yearfac') # Look at residuals against region? resids <- residuals(mod) ggplot(data.frame(model_data %>% dplyr::bind_cols(data.frame(resids))), aes(x = year, y = resids)) + geom_point() + geom_hline(yintercept = 0, linetype = 'dashed') + facet_wrap(~region) # Aha! Need to add region as a factor as well mod2 <- gam(logcount ~ s(yearfac, bs = 're') + s(region, bs = 're'), data = model_data, family = Tweedie(p = 1.5), method = 'REML') gam.vcomp(mod2) summary(mod2) appraise(mod2) # Look at residuals again resids <- residuals(mod2) ggplot(data.frame(model_data %>% dplyr::bind_cols(data.frame(resids))), aes(x = year, y = resids)) + geom_point() + geom_hline(yintercept = 0, linetype = 'dashed') + facet_wrap(~region) # Better, but still obvious problems # (the trend is not the same per region, how can we model this?) #### Bonus tasks #### # 1. Calculate residual ACF and pACF functions for each region using # the residuals from mod2. Hint, you can add residuals to the # data object as we've done above # 2. Look at the help file on random effects in mgcv and work out # how to fit a model that includes random slopes of year. It'll be # best to scale the year variable beforehand for easier computation model_data %>% dplyr::mutate(year_scaled = as.vector(scale(year))) -> model_data # 3. Look at residuals against region for this model, and use # plot_predictions() to show the predicted linear year functions # for each region # 4. Compare the model with random slopes against mod2 using a # Generalized Likelihood Ratio test (hint, see anova.gam() for help) ?anova.gam