# ============================================================================
# PSYC 434 — Lab 8: RATE and QINI Curves
# self-standing script — run from top to bottom
# ============================================================================

# --- packages ---------------------------------------------------------------

library(causalworkshop)
library(grf)
library(tidyverse)

# --- fit causal forest (from Labs 5-6) --------------------------------------

d <- simulate_nzavs_data(n = 5000, seed = 2026)
d0 <- d |> filter(wave == 0)
d1 <- d |> filter(wave == 1)
d2 <- d |> filter(wave == 2)

covariate_cols <- c(
  "age", "male", "nz_european", "education", "partner", "employed",
  "log_income", "nz_dep", "agreeableness", "conscientiousness",
  "extraversion", "neuroticism", "openness",
  "community_group", "wellbeing"
)

X <- as.matrix(d0[, covariate_cols])
Y <- d2$wellbeing
W <- d1$community_group

cf <- causal_forest(
  X, Y, W,
  num.trees = 1000,
  honesty = TRUE,
  tune.parameters = "all",
  seed = 2026
)

tau_hat <- predict(cf)$predictions

# --- rank individuals by predicted benefit ----------------------------------

n <- length(tau_hat)
tau_order <- order(tau_hat, decreasing = TRUE)
tau_sorted <- tau_hat[tau_order]

cat("Top 5 predicted effects:   ", round(head(tau_sorted, 5), 3), "\n")
cat("Bottom 5 predicted effects:", round(tail(tau_sorted, 5), 3), "\n")
cat("Overall mean:              ", round(mean(tau_hat), 3), "\n")

# --- RATE curve -------------------------------------------------------------

rates <- seq(0.05, 1.00, by = 0.05)
rate_results <- tibble(
  rate = numeric(),
  avg_tau_targeted = numeric(),
  gain_over_random = numeric()
)

for (r in rates) {
  n_targeted <- floor(r * n)
  targeted_idx <- tau_order[seq_len(n_targeted)]

  avg_targeted <- mean(tau_hat[targeted_idx])
  gain <- avg_targeted - mean(tau_hat)

  rate_results <- bind_rows(
    rate_results,
    tibble(rate = r, avg_tau_targeted = avg_targeted, gain_over_random = gain)
  )
}

print(rate_results |> mutate(across(where(is.numeric), \(x) round(x, 3))))

# plot RATE curve
ggplot(rate_results, aes(x = rate, y = gain_over_random)) +
  geom_line(colour = "#E69F00", linewidth = 1) +
  geom_point(colour = "#E69F00", size = 2) +
  scale_x_continuous(labels = scales::percent_format()) +
  labs(
    title = "RATE curve: gain from targeting",
    x = "Targeting rate (proportion treated)",
    y = "Gain over random assignment"
  ) +
  theme_minimal()

# --- QINI curve -------------------------------------------------------------

qini_results <- tibble(
  percentile = numeric(),
  cumulative_gain = numeric()
)

for (p in rates) {
  n_top <- floor(p * n)
  top_idx <- tau_order[seq_len(n_top)]
  cum_gain <- sum(tau_hat[top_idx]) - p * sum(tau_hat)

  qini_results <- bind_rows(
    qini_results,
    tibble(percentile = p, cumulative_gain = cum_gain)
  )
}

print(qini_results |> mutate(across(where(is.numeric), \(x) round(x, 3))))

# plot QINI curve
ggplot(qini_results, aes(x = percentile, y = cumulative_gain)) +
  geom_line(colour = "#56B4E9", linewidth = 1) +
  geom_point(colour = "#56B4E9", size = 2) +
  scale_x_continuous(labels = scales::percent_format()) +
  labs(
    title = "QINI curve: cumulative targeting gain",
    x = "Population percentile",
    y = "Cumulative gain over random"
  ) +
  theme_minimal()

# area under QINI curve
auqc <- sum(qini_results$cumulative_gain * 0.05)
cat("Area Under QINI Curve (AUQC):", round(auqc, 3), "\n")

# --- targeting efficiency ---------------------------------------------------

top_10_idx <- tau_order[seq_len(floor(0.10 * n))]
top_20_idx <- tau_order[seq_len(floor(0.20 * n))]
top_50_idx <- tau_order[seq_len(floor(0.50 * n))]

efficiency <- tibble(
  group = c("Top 10%", "Top 20%", "Top 50%", "Everyone"),
  avg_effect = c(
    mean(tau_hat[top_10_idx]),
    mean(tau_hat[top_20_idx]),
    mean(tau_hat[top_50_idx]),
    mean(tau_hat)
  ),
  lift_vs_random = c(
    mean(tau_hat[top_10_idx]) / mean(tau_hat),
    mean(tau_hat[top_20_idx]) / mean(tau_hat),
    mean(tau_hat[top_50_idx]) / mean(tau_hat),
    1
  )
) |>
  mutate(efficiency_gain_pct = round((lift_vs_random - 1) * 100, 1))

print(efficiency |> mutate(across(where(is.numeric), \(x) round(x, 3))))

# --- covariate profile of high-benefit individuals --------------------------

top_10_data <- d0[tau_order[seq_len(floor(0.10 * n))], ]
everyone <- d0

cat("Top 10% vs everyone:\n")
cat("  Extraversion:    ", round(mean(top_10_data$extraversion), 2),
    "vs", round(mean(everyone$extraversion), 2), "\n")
cat("  Neuroticism:     ", round(mean(top_10_data$neuroticism), 2),
    "vs", round(mean(everyone$neuroticism), 2), "\n")
cat("  Partner (prop):  ", round(mean(top_10_data$partner), 2),
    "vs", round(mean(everyone$partner), 2), "\n")
cat("  Agreeableness:   ", round(mean(top_10_data$agreeableness), 2),
    "vs", round(mean(everyone$agreeableness), 2), "\n")