# ==============================================================================
# simulation script: population estimates and causal inference
# psyc 434 — causal inference for psychological science (2026)
# adapted from the 2025 sasp causal inference workshop
# ==============================================================================

# load required libraries --------------------------------------------------
library(tidyverse)
library(stdReg)
library(gtsummary)
library(clarify)
library(grf)

# ==============================================================================
# helper: simulate ate data with weights
# (replaces margot::simulate_ate_data_with_weights)
# ==============================================================================
simulate_ate_data_with_weights <- function(n_sample = 10000,
                                            n_population = 100000,
                                            p_z_sample = 0.1,
                                            p_z_population = 0.5,
                                            beta_a = 1,
                                            beta_z = 2.5,
                                            noise_sd = 0.5) {
  # sample data
  z_sample <- rbinom(n_sample, 1, p_z_sample)
  a_sample <- rbinom(n_sample, 1, 0.5)
  y_sample <- beta_a * a_sample + beta_z * a_sample * z_sample +
    rnorm(n_sample, 0, noise_sd)

  # population data
  z_population <- rbinom(n_population, 1, p_z_population)
  a_population <- rbinom(n_population, 1, 0.5)
  y_population <- beta_a * a_population + beta_z * a_population * z_population +
    rnorm(n_population, 0, noise_sd)

  # compute weights (inverse probability of being in the sample given z)
  p_sample_z1 <- (n_sample * p_z_sample) /
    (n_sample * p_z_sample + n_population * p_z_population)
  p_sample_z0 <- (n_sample * (1 - p_z_sample)) /
    (n_sample * (1 - p_z_sample) + n_population * (1 - p_z_population))
  weights <- ifelse(z_sample == 1, 1 / p_sample_z1, 1 / p_sample_z0)
  weights <- weights / mean(weights) # normalise

  sample_data <- data.frame(
    y_sample = y_sample,
    a_sample = a_sample,
    z_sample = z_sample,
    weights = weights
  )

  population_data <- data.frame(
    y_population = y_population,
    a_population = a_population,
    z_population = z_population
  )

  list(sample_data = sample_data, population_data = population_data)
}

# ==============================================================================
# s2/s3: generalisability and transportability simulation
# ==============================================================================
# simulate data with different distributions of effect modifiers
# between sample and population
set.seed(123)
data <- simulate_ate_data_with_weights(
  n_sample = 10000,
  n_population = 100000,
  p_z_sample = 0.1,
  p_z_population = 0.5,
  beta_a = 1,
  beta_z = 2.5,
  noise_sd = 0.5
)

# extract sample and population data
sample_data <- data$sample_data
population_data <- data$population_data

# check imbalance in effect modifier distribution
cat("\nsample distribution of effect modifier:\n")
print(table(sample_data$z_sample))
cat("\npopulation distribution of effect modifier:\n")
print(table(population_data$z_population))

# model coefficients: sample
model_sample <- glm(y_sample ~ a_sample * z_sample,
  data = sample_data
)
cat("\n=== model coefficients: sample ===\n")
print(summary(model_sample)$coefficients)

# model coefficients: weighted sample
model_weighted_sample <- glm(y_sample ~ a_sample * z_sample,
  data = sample_data,
  weights = weights
)
cat("\n=== model coefficients: weighted sample ===\n")
print(summary(model_weighted_sample)$coefficients)

# model coefficients: population
model_population <- glm(y_population ~ a_population * z_population,
  data = population_data
)
cat("\n=== model coefficients: population ===\n")
print(summary(model_population)$coefficients)
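# ------------------------------------------------------------------------------
# added check (a sketch, not part of the original workshop code): under this
# data-generating process y = beta_a * a + beta_z * a * z + noise, so the
# marginal ate is beta_a + beta_z * Pr(z = 1). working the targets out
# analytically shows what the estimators below should recover.
true_beta_a <- 1
true_beta_z <- 2.5
cat("\nimplied sample ate:     ", true_beta_a + true_beta_z * 0.1, "\n") # 1.25
cat("implied population ate: ", true_beta_a + true_beta_z * 0.5, "\n")   # 2.25
# ------------------------------------------------------------------------------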
# marginal effect estimates using stdReg ------------------------------------

# sample ate
fit_std_sample <- stdReg::stdGlm(model_sample,
  data = sample_data,
  X = "a_sample"
)
cat("\n=== sample ate ===\n")
print(summary(fit_std_sample, contrast = "difference", reference = 0))

# population ate (oracle)
fit_std_population <- stdReg::stdGlm(model_population,
  data = population_data,
  X = "a_population"
)
cat("\n=== population ate (oracle) ===\n")
print(summary(fit_std_population, contrast = "difference", reference = 0))

# weighted sample ate
fit_std_weighted_sample_weights <- stdReg::stdGlm(model_weighted_sample,
  data = sample_data,
  X = "a_sample"
)
cat("\n=== weighted sample ate ===\n")
print(summary(fit_std_weighted_sample_weights, contrast = "difference", reference = 0))

# manual ate calculation using predict function ----------------------------
cat("\n=== manual ate calculation using predict function ===\n")
cat("this section explicitly shows what stdReg is doing under the hood\n\n")

# create counterfactual datasets where everyone gets a = 0 and a = 1
sample_data_a0 <- sample_data |> mutate(a_sample = 0)
sample_data_a1 <- sample_data |> mutate(a_sample = 1)

# predict outcomes under each treatment level
y_pred_a0_sample <- predict(model_sample, newdata = sample_data_a0)
y_pred_a1_sample <- predict(model_sample, newdata = sample_data_a1)

# calculate ate as mean difference in predicted outcomes
ate_sample_manual <- mean(y_pred_a1_sample - y_pred_a0_sample)
cat("sample ate (manual calculation):", round(ate_sample_manual, 3), "\n")

# population ate using predict
population_data_a0 <- population_data |> mutate(a_population = 0)
population_data_a1 <- population_data |> mutate(a_population = 1)

y_pred_a0_population <- predict(model_population, newdata = population_data_a0)
y_pred_a1_population <- predict(model_population, newdata = population_data_a1)

ate_population_manual <- mean(y_pred_a1_population - y_pred_a0_population)
cat("population ate (manual calculation):", round(ate_population_manual, 3), "\n")

# weighted sample ate using predict
y_pred_a0_weighted <- predict(model_weighted_sample, newdata = sample_data_a0)
y_pred_a1_weighted <- predict(model_weighted_sample, newdata = sample_data_a1)

ate_weighted_sample_manual <- mean(y_pred_a1_weighted - y_pred_a0_weighted)
cat("weighted sample ate (manual calculation):", round(ate_weighted_sample_manual, 3), "\n")

# key insight
cat("\n=== key insight ===\n")
cat("1. model coefficients are nearly identical across sample, weighted sample, and population\n")
cat(
  "2. but the marginal ates differ between sample (", round(ate_sample_manual, 3),
  ") and population (", round(ate_population_manual, 3), ")\n"
)
cat("3. weighting corrects this, giving ate = ", round(ate_weighted_sample_manual, 3), "\n")
cat("4. this happens because the distribution of the effect modifier differs\n")
cat("   sample: z=1 is rare; population: z=1 is common\n\n")
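# ------------------------------------------------------------------------------
# added sketch (not part of the original workflow): an alternative to weighting
# is to standardise the sample outcome model over the population's distribution
# of the effect modifier (g-computation in the target population). this assumes
# the sample model is correctly specified and that z is the only effect modifier
# whose distribution differs between sample and population. the object names
# below (pop_grid_a1, pop_grid_a0, ate_transported) are new to this sketch.
pop_grid_a1 <- data.frame(a_sample = 1, z_sample = population_data$z_population)
pop_grid_a0 <- data.frame(a_sample = 0, z_sample = population_data$z_population)
ate_transported <- mean(
  predict(model_sample, newdata = pop_grid_a1) -
    predict(model_sample, newdata = pop_grid_a0)
)
cat("transported ate (sample model, population z):", round(ate_transported, 3), "\n")
# this should be close to the population ate estimated above
# ------------------------------------------------------------------------------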
# ==============================================================================
# s4: cross-sectional data ambiguity simulation
# ==============================================================================
# simulate data where l is a mediator between a and y
set.seed(123)
n <- 1000
p <- 0.5
alpha <- 0
beta <- 2
gamma <- 1
delta <- 1.5
sigma_L <- 1
sigma_Y <- 1.5

# simulate the data: effect of a on y is fully mediated by l
A <- rbinom(n, 1, p)
L <- alpha + beta * A + rnorm(n, 0, sigma_L)
Y <- gamma + delta * L + rnorm(n, 0, sigma_Y)

# create data frame
data_cross <- data.frame(A = A, L = L, Y = Y)

# fit regression models
# model 1: control for l (assuming l is a confounder)
fit_1 <- lm(Y ~ A + L, data = data_cross)

# model 2: omit l (assuming l is a mediator)
fit_2 <- lm(Y ~ A, data = data_cross)

# create comparison table
table1 <- gtsummary::tbl_regression(fit_1)
table2 <- gtsummary::tbl_regression(fit_2)

table_comparison <- gtsummary::tbl_merge(
  list(table1, table2),
  tab_spanner = c(
    "Model: L assumed confounder",
    "Model: L assumed mediator"
  )
)
cat("\n=== cross-sectional model comparison ===\n")
print(table_comparison)

# calculate ate using clarify
set.seed(2025)
sim_coefs_fit_1 <- sim(fit_1)
sim_coefs_fit_2 <- sim(fit_2)

sim_est_fit_1 <- sim_ame(
  sim_coefs_fit_1,
  var = "A",
  subset = A == 1,
  contrast = "RD",
  verbose = FALSE
)
sim_est_fit_2 <- sim_ame(
  sim_coefs_fit_2,
  var = "A",
  subset = A == 1,
  contrast = "RD",
  verbose = FALSE
)

summary_sim_est_fit_1 <- summary(sim_est_fit_1, null = c(`RD` = 0))
summary_sim_est_fit_2 <- summary(sim_est_fit_2, null = c(`RD` = 0))

cat("\n=== ate estimates ===\n")
cat(
  "model 1 (l as confounder): ate =",
  round(summary_sim_est_fit_1[3, 1], 2), "\n"
)
cat(
  "model 2 (l as mediator): ate =",
  round(summary_sim_est_fit_2[3, 1], 2), "\n"
)
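# ------------------------------------------------------------------------------
# added check (a sketch, not in the original script): because the effect of a on
# y is fully mediated by l (a -> l -> y), the true total effect is
# beta * delta = 2 * 1.5 = 3 and the true direct effect is 0. model 2 targets
# the total effect; model 1 blocks the mediated path by conditioning on l.
cat(
  "\ntrue total effect implied by the data-generating process:",
  beta * delta, "\n"
)
# ------------------------------------------------------------------------------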
# ==============================================================================
# appendix d: confounding control strategies simulation
# ==============================================================================
set.seed(123)
n <- 10000

# baseline covariates
U <- rnorm(n)
A_0 <- rbinom(n, 1, prob = plogis(U))
Y_0 <- rnorm(n, mean = U, sd = 1)
L_0 <- rnorm(n, mean = U, sd = 1)

# coefficients for treatment assignment
beta_A0 <- 0.25
beta_Y0 <- 0.3
beta_L0 <- 0.2
beta_U <- 0.1

# simulate treatment assignment
A_1 <- rbinom(n, 1, prob = plogis(
  -0.5 + beta_A0 * A_0 + beta_Y0 * Y_0 + beta_L0 * L_0 + beta_U * U
))

# coefficients for continuous outcome
delta_A1 <- 0.3
delta_Y0 <- 0.9
delta_A0 <- 0.1
delta_L0 <- 0.3
theta_A0Y0L0 <- 0.5
delta_U <- 0.05

# simulate continuous outcome
Y_2 <- rnorm(n,
  mean = 0 + delta_A1 * A_1 + delta_Y0 * Y_0 + delta_A0 * A_0 +
    delta_L0 * L_0 + theta_A0Y0L0 * Y_0 * A_0 * L_0 + delta_U * U,
  sd = .5
)

# create data frame
data_confound <- data.frame(Y_2, A_0, A_1, L_0, Y_0, U)

# fit models
fit_no_control <- lm(Y_2 ~ A_1, data = data_confound)
fit_standard <- lm(Y_2 ~ A_1 + L_0, data = data_confound)
fit_interaction <- lm(Y_2 ~ A_1 * (L_0 + A_0 + Y_0), data = data_confound)

# create gtsummary tables
tbl_fit_no_control <- tbl_regression(fit_no_control)
tbl_fit_standard <- tbl_regression(fit_standard)
tbl_fit_interaction <- tbl_regression(fit_interaction)

# filter to show only treatment variable
tbl_list_modified <- lapply(
  list(tbl_fit_no_control, tbl_fit_standard, tbl_fit_interaction),
  function(tbl) {
    tbl |> modify_table_body(~ .x |> dplyr::filter(variable == "A_1"))
  }
)

# merge tables
table_comparison_confound <- tbl_merge(
  tbls = tbl_list_modified,
  tab_spanner = c("No Control", "Standard", "Interaction")
)
cat("\n=== confounding control strategy comparison ===\n")
print(table_comparison_confound)

# calculate ate using clarify
set.seed(123)
sim_coefs_fit_no_control <- sim(fit_no_control)
sim_coefs_fit_std <- sim(fit_standard)
sim_coefs_fit_int <- sim(fit_interaction)

sim_est_fit_no_control <- sim_ame(
  sim_coefs_fit_no_control,
  var = "A_1",
  subset = A_1 == 1,
  contrast = "RD",
  verbose = FALSE
)
sim_est_fit_std <- sim_ame(
  sim_coefs_fit_std,
  var = "A_1",
  subset = A_1 == 1,
  contrast = "RD",
  verbose = FALSE
)
sim_est_fit_int <- sim_ame(
  sim_coefs_fit_int,
  var = "A_1",
  subset = A_1 == 1,
  contrast = "RD",
  verbose = FALSE
)

summary_sim_est_fit_no_control <- summary(sim_est_fit_no_control, null = c(`RD` = 0))
summary_sim_est_fit_std <- summary(sim_est_fit_std, null = c(`RD` = 0))
summary_sim_est_fit_int <- summary(sim_est_fit_int, null = c(`RD` = 0))

cat("\n=== ate estimates for confounding control strategies ===\n")
cat(
  "no control: ate =",
  round(summary_sim_est_fit_no_control[3, 1], 2), "\n"
)
cat(
  "standard control: ate =",
  round(summary_sim_est_fit_std[3, 1], 2), "\n"
)
cat(
  "interaction model: ate =",
  round(summary_sim_est_fit_int[3, 1], 2), "\n"
)

# ==============================================================================
# appendix e: causal forest estimation
# ==============================================================================
# prepare data for causal forest
W <- as.matrix(data_confound$A_1)
Y <- as.matrix(data_confound$Y_2)
X <- as.matrix(data_confound[, c("L_0", "A_0", "Y_0")])

# fit causal forest model
fit_causal_forest <- causal_forest(X, Y, W)

# estimate average treatment effect
ate_cf <- average_treatment_effect(fit_causal_forest)
ate_cf_df <- data.frame(ate_cf)

cat("\n=== causal forest ate estimate ===\n")
cat(
  "ate =", round(ate_cf_df[1, 1], 2),
  ", se =", round(ate_cf_df[2, 1], 2), "\n"
)

# ==============================================================================
# end of script
# ==============================================================================
cat("\n=== all simulations completed successfully ===\n")
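# ------------------------------------------------------------------------------
# optional follow-up sketch (not part of the original script): in the appendix d
# data-generating process the treatment a_1 enters the outcome only through the
# linear term delta_A1 * A_1, so the true ate is delta_A1 = 0.3. the adjusted
# regressions and the causal forest should recover roughly this value, and the
# forest's out-of-bag cate estimates should cluster around it. tau_hat is a new
# object introduced here for illustration.
cat("\ntrue ate implied by the data-generating process:", delta_A1, "\n")
tau_hat <- predict(fit_causal_forest)$predictions # out-of-bag cate estimates
cat("summary of causal forest cate estimates:\n")
print(summary(tau_hat))
# ------------------------------------------------------------------------------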