# ============================================================================
# PSYC 434 — Lab 5: Average Treatment Effects
# self-standing script — run from top to bottom
# ============================================================================

# --- packages ---------------------------------------------------------------

# install causalworkshop if needed:
# install.packages("remotes")
# remotes::install_github("go-bayes/causalworkshop")

library(causalworkshop)
library(grf)
library(tidyverse)

# --- simulate data ----------------------------------------------------------

d <- simulate_nzavs_data(n = 5000, seed = 2026)
dim(d)
names(d)

# separate waves
d0 <- d |> filter(wave == 0)  # baseline confounders
d1 <- d |> filter(wave == 1)  # exposure assignment
d2 <- d |> filter(wave == 2)  # outcomes

stopifnot(all(d0$id == d1$id), all(d0$id == d2$id))

# ground truth
true_ate <- mean(d0$tau_community_wellbeing)
cat("True ATE:", round(true_ate, 3), "\n")

# --- naive ATE (biased) ----------------------------------------------------

fit_naive <- lm(d2$wellbeing ~ d1$community_group)
naive_ate <- coef(fit_naive)[2]
cat("Naive ATE:", round(naive_ate, 3), "\n")
cat("True ATE: ", round(true_ate, 3), "\n")
cat("Bias:     ", round(naive_ate - true_ate, 3), "\n")

# --- adjusted ATE (regression) ---------------------------------------------

df <- data.frame(
  y = d2$wellbeing,
  a = d1$community_group,
  age = d0$age,
  male = d0$male,
  nz_european = d0$nz_european,
  education = d0$education,
  partner = d0$partner,
  employed = d0$employed,
  log_income = d0$log_income,
  nz_dep = d0$nz_dep,
  agreeableness = d0$agreeableness,
  conscientiousness = d0$conscientiousness,
  extraversion = d0$extraversion,
  neuroticism = d0$neuroticism,
  openness = d0$openness,
  community_t0 = d0$community_group,
  wellbeing_t0 = d0$wellbeing
)

fit_adj <- lm(y ~ a + age + male + nz_european + education + partner +
                employed + log_income + nz_dep + agreeableness +
                conscientiousness + extraversion + neuroticism + openness +
                community_t0 + wellbeing_t0, data = df)

adj_ate <- coef(fit_adj)["a"]
cat("Adjusted ATE:", round(adj_ate, 3), "\n")
cat("True ATE:    ", round(true_ate, 3), "\n")
cat("Bias:        ", round(adj_ate - true_ate, 3), "\n")

# --- g-computation by hand -------------------------------------------------

df_treated <- df
df_treated$a <- 1

df_control <- df
df_control$a <- 0

y_hat_treated <- predict(fit_adj, newdata = df_treated)
y_hat_control <- predict(fit_adj, newdata = df_control)

gcomp_ate <- mean(y_hat_treated - y_hat_control)
cat("G-computation ATE:", round(gcomp_ate, 3), "\n")
cat("True ATE:         ", round(true_ate, 3), "\n")

# --- causal forest ATE ------------------------------------------------------

covariate_cols <- c(
  "age", "male", "nz_european", "education", "partner", "employed",
  "log_income", "nz_dep", "agreeableness", "conscientiousness",
  "extraversion", "neuroticism", "openness",
  "community_t0", "wellbeing_t0"
)

X <- as.matrix(df[, covariate_cols])
Y <- df$y
W <- df$a

cf <- causal_forest(
  X, Y, W,
  num.trees = 1000,
  honesty = TRUE,
  tune.parameters = "all",
  seed = 2026
)

ate_cf <- average_treatment_effect(cf)
cat("Causal forest ATE:", round(ate_cf["estimate"], 3),
    "(SE:", round(ate_cf["std.err"], 3), ")\n")
cat("True ATE:         ", round(true_ate, 3), "\n")

# --- compare all estimates --------------------------------------------------

results <- data.frame(
  method = c("Naive", "Adjusted regression", "G-computation", "Causal forest"),
  estimate = c(naive_ate, adj_ate, gcomp_ate, ate_cf["estimate"]),
  bias = c(naive_ate - true_ate, adj_ate - true_ate,
           gcomp_ate - true_ate, ate_cf["estimate"] - true_ate)
)
results$estimate <- round(results$estimate, 3)
results$bias <- round(results$bias, 3)
print(results)
cat("\nTrue ATE:", round(true_ate, 3), "\n")