library(mgcv) library(marginaleffects) library(dplyr) library(gratia) library(ggplot2); theme_set(theme_bw()) # Simulate some data in which multiple covariates may impact the conditional # response in nonlinear ways set.seed(0) simdat <- gamSim() dplyr::glimpse(simdat) # Plot the data and fit some simple smooths ggplot(simdat, aes(x = x0, y = y)) + geom_point() + geom_smooth(fill = 'darkred', col = 'darkred') ggplot(simdat, aes(x = x1, y = y)) + geom_point() + geom_smooth(fill = 'darkred', col = 'darkred') ggplot(simdat, aes(x = x2, y = y)) + geom_point() + geom_smooth(fill = 'darkred', col = 'darkred') ggplot(simdat, aes(x = x3, y = y)) + geom_point() + geom_smooth(fill = 'darkred', col = 'darkred') # The outcome is unbounded continuous (Gaussian appropriate?) ggplot(simdat, aes(y)) + geom_histogram(col = 'white') summary(simdat\$y) # We've seen smooths in one dimension so far; just to refresh # Thin plate basis (default in unidimensional smooths) basis(s(x0), data = simdat) %>% draw() # Cubic regression basis basis(s(x0, bs = 'cr'), data = simdat) %>% draw() # Changing k basis(s(x0, bs = 'cr', k = 20), data = simdat) %>% draw() # B-splines with first-derivative penalty basis(s(x0, bs = 'bs', m = 1), data = simdat) %>% draw() # But how to include interactions in GAMs? # Option 1: including multiple terms in s() # This assumes all covariates wrapped in s() are on the same scale and # that smoothness should be the same in all dimensions mod1 <- gam(y ~ s(x0, x1, k = 100, bs = 'tp'), data = simdat, family = gaussian(), method = 'REML') summary(mod1) # Plot the interaction using gratia draw(mod1) # Now using targeted visuals with marginaleffects' plot_predictions(); # use custom functions to determinehow the underlying data_grid() will # be set up. This can be useful when plotting many effects summary_round = function(x){ round(fivenum(x, na.rm = TRUE), 4) } seq_even = function(x){ seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = 100) } plot_predictions(mod1, by = c('x0', 'x1'), newdata = datagrid(model = mod1, x0 = seq(min(simdat\$x0), max(simdat\$x0), length.out = 50), x1 = summary_round)) plot_predictions(mod1, by = c('x1', 'x0'), newdata = datagrid(model = mod1, x1 = seq(min(simdat\$x1), max(simdat\$x1), length.out = 50), x0 = summary_round)) # Option 2: including multiple terms in te() # This does not assume covariates are on the same scale, or that # smoothness should be isotropic. It is much more flexible and is therefore # preferred ?mgcv::gam.models ?mgcv::te mod2 <- gam(y ~ te(x0, x1, k = c(10, 10), bs = c('tp', 'tp')), data = simdat, family = gaussian(), method = 'REML') summary(mod2) # Plot again draw(mod2) plot_predictions(mod2, by = c('x0', 'x1'), newdata = datagrid(model = mod1, x0 = seq(min(simdat\$x0), max(simdat\$x0), length.out = 50), x1 = summary_round)) plot_predictions(mod2, by = c('x1', 'x0'), newdata = datagrid(model = mod1, x1 = seq(min(simdat\$x1), max(simdat\$x1), length.out = 50), x0 = summary_round)) # Option 3: ANOVA decomposition with te() and ti() # Same assumptions as with te() but this allows the 'main' effects # of the covariates to be isolated from the 'interaction' effects mod3 <- gam(y ~ s(x0, k = 10, bs = 'tp') + s(x1, k = 10, bs = 'tp') + ti(x0, x1, k = c(10, 10), bs = c('tp', 'tp')), data = simdat, family = gaussian(), method = 'REML') summary(mod3) # Drawing these functions can be very useful but also can be confusing; # i.e. the total effect of x0 is now spread among two different terms draw(mod3) # plot_predictions() shows only the 'total' effects, as before plot_predictions(mod3, by = c('x0', 'x1'), newdata = datagrid(model = mod1, x0 = seq(min(simdat\$x0), max(simdat\$x0), length.out = 50), x1 = summary_round)) plot_predictions(mod3, by = c('x1', 'x0'), newdata = datagrid(model = mod1, x1 = seq(min(simdat\$x1), max(simdat\$x1), length.out = 50), x0 = summary_round)) # mod2 and mod3 are roughly equivalent, but mod3 is much more flexible and # can allow you to accommodate highly complex models with many interactions # For example, say we want x0:x1 and x1:x2 interactions, but we don't want # the full three way interaction x0:x1:x2 mod4 <- gam(y ~ # main, lower-order effects s(x0, k = 10, bs = 'tp') + s(x1, k = 10, bs = 'tp') + s(x2, k = 10, bs = 'tp') + # second-level interaction effects (notice how x1 is allowed # to be in both of these) ti(x0, x1, k = c(10, 10), bs = c('tp', 'tp')) + ti(x1, x2, k = c(10, 10), bs = c('tp', 'tp')), data = simdat, family = gaussian(), method = 'REML', # impose extra penalties to help regularize smooths toward # straight lines select = TRUE) # Interactions needed? summary(mod4) # Now draw() gets even more difficult to interpret draw(mod4) # Some more targeted plots with plot_predictions() plot_predictions(mod4, by = c('x0', 'x1', 'x2'), newdata = datagrid(x0 = seq_even, x1 = summary_round, x2 = summary_round)) plot_predictions(mod4, by = c('x1', 'x0', 'x2'), newdata = datagrid(x1 = seq_even, x0 = summary_round, x2 = summary_round)) # Interactions can also be formed between a multidimensional smooth and # another effect (i.e. if you want a spatial smooth to interact with a smooth # of time) mod5 <- gam(y ~ te(x0, x1, x2, # d tells te() that we want a 2-dimensional smooth # of x0 and x1 to interact with a 1-dimensional # smooth of x2 d = c(2, 1), # basis types and k values for the two smooths bs = c('cr', 'cr'), k = c(25, 10)), data = simdat, family = gaussian(), method = 'REML') # Diagnostics and some plots summary(mod5) appraise(mod5, method = 'simulate') draw(mod5) plot_predictions(mod5, by = c('x0', 'x1', 'x2'), newdata = datagrid(x0 = seq_even, x1 = summary_round, x2 = summary_round)) # Model comparisons anova(mod1, mod2, mod3, mod4, mod5, test = 'Chisq') #### Bonus tasks #### # 1. Plot residuals of mod4 against each predictor variable