# ============================================================================
# PSYC 434 — Lab 4: Regression and Confounding Bias
# self-standing script — run from top to bottom
# ============================================================================

# --- packages ---------------------------------------------------------------

library(tidyverse)
library(ggplot2)
library(splines)
library(report)
library(performance)
library(ggeffects)
library(sjPlot)
library(dplyr)
library(skimr)
library(gtsummary)
library(kableExtra)
library(patchwork)
library(parameters)

# --- simulated height data --------------------------------------------------

set.seed(123)

# 10-person sample
draws_10 <- rnorm(10, mean = 170, sd = 20)
ggplot2::qplot(draws_10, binwidth = 2)

# 100-person sample
draws_100 <- rnorm(100, mean = 170, sd = 20)
ggplot2::qplot(draws_100, binwidth = 2)

# 10000-person sample
draws_10000 <- rnorm(1e5, mean = 170, sd = 20)
ggplot2::qplot(draws_10000, binwidth = 2)

# intercept-only models
sjPlot::tab_model(lm(draws_10 ~ 1))
sjPlot::tab_model(lm(draws_100 ~ 1))
sjPlot::tab_model(lm(draws_10000 ~ 1))

# compare all three
sjPlot::tab_model(lm(draws_10 ~ 1),
                  lm(draws_100 ~ 1),
                  lm(draws_10000 ~ 1))

# --- pearson and lee 1903 data ----------------------------------------------

df_pearson_lee <-
  data.frame(read.table(
    url(
      "https://raw.githubusercontent.com/avehtari/ROS-Examples/master/PearsonLee/data/MotherDaughterHeights.txt"
    ),
    header = TRUE
  ))

# centre mother's height
df_pearson_lee_centered <- df_pearson_lee |>
  dplyr::mutate(mother_height_c = as.numeric(scale(
    mother_height, center = TRUE, scale = FALSE
  )))

skimr::skim(df_pearson_lee_centered)

# explore the relationship
explore_md <-
  ggplot2::ggplot(data = df_pearson_lee_centered,
                  aes(y = daughter_height, x = mother_height)) +
  geom_jitter(alpha = .2) +
  labs(title = "The relationship between mother's height and daughter's height") +
  ylab("Daughter's height") +
  xlab("Mother's height") +
  theme_classic()

print(explore_md)

# --- regression: daughter height ~ mother height ----------------------------

m1 <- lm(daughter_height ~ mother_height, data = df_pearson_lee_centered)
sjPlot::tab_model(m1)

t_m1 <- parameters::model_parameters(m1, ci = 0.95)
plot(t_m1) +
  labs(title = "The relationship between mother's height and daughter's height") +
  ylab("Daughter's height")

# predicted values
predictions <- ggeffects::ggpredict(m1, terms = "mother_height",
    add.data = TRUE,
    dot.alpha = .1,
    jitter = TRUE)

plot_predictions <-
  plot(predictions) +
  theme_classic() +
  labs(title = "Predicted values of daughter's height from the Pearson/Fox 1903 dataset")

plot_predictions

# predict beyond range of data
df_expand_grid <- expand.grid(mother_height = c(25:91))

df_predict <-
  predict(m1,
          type = "response",
          interval = "confidence",
          newdata = df_expand_grid)

newdata <- data.frame(df_expand_grid, df_predict)
head(newdata)

predplot <- ggplot(data = newdata,
                   aes(x = mother_height, y = fit)) +
  geom_point() +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = .1) +
  expand_limits(x = c(20, 91), y = c(0, 81)) +
  theme_classic() +
  labs(title = "Predicted values for a broader population")

predplot

# --- non-linear relationships -----------------------------------------------

b <- c(2, 0.75)
set.seed(12)
x <- rnorm(100)
set.seed(12)
y <- rnorm(100, mean = b[1] * exp(b[2] * x))
dat1 <- data.frame(x, y)

# linear model
ot1 <- lm(y ~ x, data = dat1)
plot(ggeffects::ggpredict(ot1, terms = "x",
     add.data = TRUE, dot.alpha = .4))

# quadratic model
fit_non_linear <- lm(y ~ x + I(x^2), data = dat1)
plot(ggeffects::ggpredict(fit_non_linear, terms = "x",
     add.data = TRUE, dot.alpha = .4))

# polynomial model
fit_non_linear_b <- lm(y ~ x + poly(x, 2), data = dat1)
plot(ggeffects::ggpredict(fit_non_linear_b, terms = "x",
     add.data = TRUE, dot.alpha = .4))

# spline model
fit_non_linear_c <- lm(y ~ bs(x), data = dat1)
parameters::model_parameters(fit_non_linear_c)
plot(ggeffects::ggpredict(fit_non_linear_c, terms = "x",
     add.data = TRUE, dot.alpha = .4))

# --- centring ---------------------------------------------------------------

fit_raw <- lm(daughter_height ~ mother_height, data = df_pearson_lee_centered)
fit_centered <- lm(daughter_height ~ mother_height_c, data = df_pearson_lee_centered)
sjPlot::tab_model(fit_raw, fit_centered)

plot(ggeffects::ggpredict(fit_centered, terms = "mother_height_c",
     add.data = TRUE, dot.alpha = .4))

# --- model evaluation -------------------------------------------------------

fig_intercept_only <- lm(daughter_height ~ 1, data = df_pearson_lee)
fig_covariate <- lm(daughter_height ~ mother_height, data = df_pearson_lee)
performance::compare_performance(fig_intercept_only, fig_covariate)

BIC(fig_intercept_only) - BIC(fig_covariate)

report::report_statistics(fig_covariate)
report::report(fig_covariate)

# --- simulation 1: mediator bias --------------------------------------------

set.seed(123)

n <- 1000
p <- 0.5
alpha <- 0
beta <- 2
gamma <- 1
delta <- 1.5
sigma_L <- 1
sigma_Y <- 1.5

A <- rbinom(n, 1, p)
L <- alpha + beta * A + rnorm(n, 0, sigma_L)
Y <- gamma + delta * L + rnorm(n, 0, sigma_Y)

data <- data.frame(A = A, L = L, Y = Y)

# biased model (controls for mediator)
example_fit_1 <- lm(Y ~ A + L, data = data)

# correct model (omits mediator)
example_fit_2 <- lm(Y ~ A, data = data)

# compare tables
table1 <- gtsummary::tbl_regression(example_fit_1)
table2 <- gtsummary::tbl_regression(example_fit_2)
table_comparison <- gtsummary::tbl_merge(
  list(table1, table2),
  tab_spanner = c("Model: Wealth assumed confounder",
                  "Model: Wealth assumed to be a mediator")
)
table_comparison

# compare model fits
performance::compare_performance(example_fit_1, example_fit_2)
BIC(example_fit_1) - BIC(example_fit_2)

# --- simulation 2: collider bias --------------------------------------------

set.seed(123)

n <- 1000
p <- 0.5

A_1 <- rbinom(n, 1, p)
Y_1 <- rnorm(n, 0, 1)
L_2 <- rnorm(n, A_1 + Y_1)

data_collider <- data.frame(A = A_1, L = L_2, Y = Y_1)

# biased model (controls for collider)
collider_example_fit_1 <- lm(Y ~ A + L, data = data_collider)

# correct model
collider_example_fit_2 <- lm(Y ~ A, data = data_collider)

summary(collider_example_fit_1)

collider_table1 <- gtsummary::tbl_regression(collider_example_fit_1)
collider_table2 <- gtsummary::tbl_regression(collider_example_fit_2)
collider_table_comparison <- gtsummary::tbl_merge(
  list(collider_table1, collider_table2),
  tab_spanner = c("Model: Wealth assumed confounder",
                  "Model: Wealth not assumed to be a mediator")
)
collider_table_comparison

performance::compare_performance(collider_example_fit_1, collider_example_fit_2)
BIC(collider_example_fit_1) - BIC(collider_example_fit_2)