# ============================================================================ # PSYC 434 — Lab 4: Regression and Confounding Bias # self-standing script — run from top to bottom # ============================================================================ # --- packages --------------------------------------------------------------- library(tidyverse) library(ggplot2) library(splines) library(report) library(performance) library(ggeffects) library(sjPlot) library(dplyr) library(skimr) library(gtsummary) library(kableExtra) library(patchwork) library(parameters) # --- simulated height data -------------------------------------------------- set.seed(123) # 10-person sample draws_10 <- rnorm(10, mean = 170, sd = 20) ggplot2::qplot(draws_10, binwidth = 2) # 100-person sample draws_100 <- rnorm(100, mean = 170, sd = 20) ggplot2::qplot(draws_100, binwidth = 2) # 10000-person sample draws_10000 <- rnorm(1e5, mean = 170, sd = 20) ggplot2::qplot(draws_10000, binwidth = 2) # intercept-only models sjPlot::tab_model(lm(draws_10 ~ 1)) sjPlot::tab_model(lm(draws_100 ~ 1)) sjPlot::tab_model(lm(draws_10000 ~ 1)) # compare all three sjPlot::tab_model(lm(draws_10 ~ 1), lm(draws_100 ~ 1), lm(draws_10000 ~ 1)) # --- pearson and lee 1903 data ---------------------------------------------- df_pearson_lee <- data.frame(read.table( url( "https://raw.githubusercontent.com/avehtari/ROS-Examples/master/PearsonLee/data/MotherDaughterHeights.txt" ), header = TRUE )) # centre mother's height df_pearson_lee_centered <- df_pearson_lee |> dplyr::mutate(mother_height_c = as.numeric(scale( mother_height, center = TRUE, scale = FALSE ))) skimr::skim(df_pearson_lee_centered) # explore the relationship explore_md <- ggplot2::ggplot(data = df_pearson_lee_centered, aes(y = daughter_height, x = mother_height)) + geom_jitter(alpha = .2) + labs(title = "The relationship between mother's height and daughter's height") + ylab("Daughter's height") + xlab("Mother's height") + theme_classic() print(explore_md) # --- regression: daughter height ~ mother height ---------------------------- m1 <- lm(daughter_height ~ mother_height, data = df_pearson_lee_centered) sjPlot::tab_model(m1) t_m1 <- parameters::model_parameters(m1, ci = 0.95) plot(t_m1) + labs(title = "The relationship between mother's height and daughter's height") + ylab("Daughter's height") # predicted values predictions <- ggeffects::ggpredict(m1, terms = "mother_height", add.data = TRUE, dot.alpha = .1, jitter = TRUE) plot_predictions <- plot(predictions) + theme_classic() + labs(title = "Predicted values of daughter's height from the Pearson/Fox 1903 dataset") plot_predictions # predict beyond range of data df_expand_grid <- expand.grid(mother_height = c(25:91)) df_predict <- predict(m1, type = "response", interval = "confidence", newdata = df_expand_grid) newdata <- data.frame(df_expand_grid, df_predict) head(newdata) predplot <- ggplot(data = newdata, aes(x = mother_height, y = fit)) + geom_point() + geom_errorbar(aes(ymin = lwr, ymax = upr), width = .1) + expand_limits(x = c(20, 91), y = c(0, 81)) + theme_classic() + labs(title = "Predicted values for a broader population") predplot # --- non-linear relationships ----------------------------------------------- b <- c(2, 0.75) set.seed(12) x <- rnorm(100) set.seed(12) y <- rnorm(100, mean = b[1] * exp(b[2] * x)) dat1 <- data.frame(x, y) # linear model ot1 <- lm(y ~ x, data = dat1) plot(ggeffects::ggpredict(ot1, terms = "x", add.data = TRUE, dot.alpha = .4)) # quadratic model fit_non_linear <- lm(y ~ x + I(x^2), data = dat1) plot(ggeffects::ggpredict(fit_non_linear, terms = "x", add.data = TRUE, dot.alpha = .4)) # polynomial model fit_non_linear_b <- lm(y ~ x + poly(x, 2), data = dat1) plot(ggeffects::ggpredict(fit_non_linear_b, terms = "x", add.data = TRUE, dot.alpha = .4)) # spline model fit_non_linear_c <- lm(y ~ bs(x), data = dat1) parameters::model_parameters(fit_non_linear_c) plot(ggeffects::ggpredict(fit_non_linear_c, terms = "x", add.data = TRUE, dot.alpha = .4)) # --- centring --------------------------------------------------------------- fit_raw <- lm(daughter_height ~ mother_height, data = df_pearson_lee_centered) fit_centered <- lm(daughter_height ~ mother_height_c, data = df_pearson_lee_centered) sjPlot::tab_model(fit_raw, fit_centered) plot(ggeffects::ggpredict(fit_centered, terms = "mother_height_c", add.data = TRUE, dot.alpha = .4)) # --- model evaluation ------------------------------------------------------- fig_intercept_only <- lm(daughter_height ~ 1, data = df_pearson_lee) fig_covariate <- lm(daughter_height ~ mother_height, data = df_pearson_lee) performance::compare_performance(fig_intercept_only, fig_covariate) BIC(fig_intercept_only) - BIC(fig_covariate) report::report_statistics(fig_covariate) report::report(fig_covariate) # --- simulation 1: mediator bias -------------------------------------------- set.seed(123) n <- 1000 p <- 0.5 alpha <- 0 beta <- 2 gamma <- 1 delta <- 1.5 sigma_L <- 1 sigma_Y <- 1.5 A <- rbinom(n, 1, p) L <- alpha + beta * A + rnorm(n, 0, sigma_L) Y <- gamma + delta * L + rnorm(n, 0, sigma_Y) data <- data.frame(A = A, L = L, Y = Y) # biased model (controls for mediator) example_fit_1 <- lm(Y ~ A + L, data = data) # correct model (omits mediator) example_fit_2 <- lm(Y ~ A, data = data) # compare tables table1 <- gtsummary::tbl_regression(example_fit_1) table2 <- gtsummary::tbl_regression(example_fit_2) table_comparison <- gtsummary::tbl_merge( list(table1, table2), tab_spanner = c("Model: Wealth assumed confounder", "Model: Wealth assumed to be a mediator") ) table_comparison # compare model fits performance::compare_performance(example_fit_1, example_fit_2) BIC(example_fit_1) - BIC(example_fit_2) # --- simulation 2: collider bias -------------------------------------------- set.seed(123) n <- 1000 p <- 0.5 A_1 <- rbinom(n, 1, p) Y_1 <- rnorm(n, 0, 1) L_2 <- rnorm(n, A_1 + Y_1) data_collider <- data.frame(A = A_1, L = L_2, Y = Y_1) # biased model (controls for collider) collider_example_fit_1 <- lm(Y ~ A + L, data = data_collider) # correct model collider_example_fit_2 <- lm(Y ~ A, data = data_collider) summary(collider_example_fit_1) collider_table1 <- gtsummary::tbl_regression(collider_example_fit_1) collider_table2 <- gtsummary::tbl_regression(collider_example_fit_2) collider_table_comparison <- gtsummary::tbl_merge( list(collider_table1, collider_table2), tab_spanner = c("Model: Wealth assumed confounder", "Model: Wealth not assumed to be a mediator") ) collider_table_comparison performance::compare_performance(collider_example_fit_1, collider_example_fit_2) BIC(collider_example_fit_1) - BIC(collider_example_fit_2)