library(mgcv) library(gamair) library(marginaleffects) library(gratia) library(ggplot2); theme_set(theme_bw()) # Load the Canada Weather observations, taken from 35 Canadian weather stations data(canWeather) dplyr::glimpse(CanWeather) # T = mean temperature in Centigrade levels(CanWeather\$region) CanWeather\$doy <- CanWeather\$time # Plot the temperature observations as a function of day, coloured by 'region' ggplot(CanWeather, aes(x = doy, y = T, colour = region)) + geom_point(alpha = 0.2) + facet_wrap(~region) # Functional data analyses treat the observations as 'functions' or 'curves'. Treating # each station's temperature observations as a function of 'day of year', we might want to # model these using the 'scalar' covariates 'latitude' and 'region' # We can fit a function on scalar regression, where # Temperature(t) = f_r(t) + f(t)*latitude + e(t) # where e(t) is AR1 Gaussian and f_r is # a smooth for region r. # But we don't know what value to use for the AR1 parameter # (which needs to be fixed in bam() models). So we use a profile likelihood approach to # select the best-fitting aic <- reml <- rho <- seq(0.9, 0.99, by = 0.01) for(i in 1:length(rho)){ # Fit models using a grid of possible AR1 parameters b <- bam(T ~ s(region, bs = 're') + s(doy, k = 10, bs = "cr", by = region) + s(doy, k = 20, bs = "cr", by = latitude), data = CanWeather, # Tell bam() where each time series begins AR.start = time==1, rho = rho[i], # Use discrete covariates for faster fitting discrete = TRUE) # Extract AIC and -REML scores; we would like to minimise both of these aic[i] <- AIC(b) reml[i] <- b\$gcv.ubre } plot(x = rho, y = reml, pch = 16) plot(x = rho, y = aic, pch = 16) # 0.97 seems optimal; refit this model mod1 <- bam(T ~ s(region, bs = 're') + s(doy, k = 10, bs = "cr", by = region) + s(doy, k = 20, bs = "cr", by = latitude), data = CanWeather, AR.start = time==1, rho = .97, discrete = TRUE) # Plot the smooths draw(mod1) # It is difficult to understand what each region's predicted function is because # the total effect for each region is spread across multiple smooths # use plot_predictions() instead plot_predictions(mod1, by = c('doy', 'region')) plot_predictions(mod1, by = c('doy', 'region', 'region'), points = 0.1) # Obviously quite a lot of latitudinal variation; # inspect latitudinally-varying predictions for a single region plot_predictions(mod1, newdata = datagrid(region = 'Continental', latitude = unique(CanWeather\$latitude[ which(CanWeather\$region == 'Continental')]), doy = unique), by = c('doy', 'latitude')) # Check for any remaining autocorrelation in the residuals, using the standardized # residuals acf(mod1\$std) #### Bonus tasks #### # 1. Clearly there is still some autocorrelation left at lag 2. If you have # mvgam installed, consider a model that will deal with this directly # (but note, this model will be SLOW) library(mvgam) CanWeather\$series <- CanWeather\$place mod2 <- mvgam( # Use a State-Space model and change 'region' to a parametric # effect to speed up sampling formula = T ~ 1, trend_formula = ~ region + s(doy, k = 10, bs = "cr", by = region) + s(doy, k = 20, bs = "cr", by = latitude), data = CanWeather, trend_model = AR(p = 2), # Use informative priors on AR and sigma pars to speed up sampling priors = c(prior(normal(0.8, 0.1), ub = 1, lb = 0, class = ar1), prior(normal(0.2, 0.5), ub = 1, lb = -1, class = ar2), prior(exponential(5), class = sigma_obs), prior(exponential(2), class = sigma)), # Shared Gaussian observation error for all series family = gaussian(), share_obs_params = TRUE, return_model_data = TRUE) summary(mod2, include_betas = FALSE) plot(mod2, type = 'smooths', trend_effects = TRUE) plot_predictions(mod2, by = c('doy', 'region'), type = 'expected') plot_predictions(mod2, newdata = datagrid(region = 'Continental', latitude = unique(CanWeather\$latitude[ which(CanWeather\$region == 'Continental')]), doy = unique), by = c('doy', 'latitude'))