# ============================================================================
# PSYC 434 — Lab 3: Regression, Graphing, and Simulation
# self-standing script — run from top to bottom
# ============================================================================

# --- packages ---------------------------------------------------------------

library(tidyverse)
library(ggplot2)
library(patchwork)
library(parameters)
library(report)
library(ggeffects)

# --- simulate continuous and categorical data -------------------------------

set.seed(123)
n <- 100
mean <- 50
sd <- 10
data_continuous <- rnorm(n, mean, sd)
head(data_continuous)
hist(data_continuous)

levels <- c("Male", "Female")
data_categorical <- sample(levels, n, replace = TRUE)
table(data_categorical)

data_categorical_unequal <- sample(levels, n, replace = TRUE, prob = c(0.3, 0.7))
table(data_categorical_unequal)

# --- simulating outcomes from treatments ------------------------------------

set.seed(123)
groupA_scores <- rnorm(100, mean = 100, sd = 15)
groupB_scores <- rnorm(100, mean = 105, sd = 15)

df_scores <- data.frame(
  group = rep(c("A", "B"), each = 100),
  scores = c(groupA_scores, groupB_scores)
)

df_scores_1 <- df_scores |>
  mutate(group = as.factor(group))

# box plot
ggplot(df_scores_1, aes(x = group, y = scores, fill = group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "score distribution by group", x = "group", y = "scores")

# histogram with facets
ggplot(df_scores_1, aes(x = scores, fill = group)) +
  geom_histogram(binwidth = 5, color = "black") +
  labs(title = "distribution of scores by group", x = "scores", y = "frequency") +
  facet_wrap(~group, ncol = 1) +
  theme_minimal()

# --- statistical tests ------------------------------------------------------

# one-sample t-test
set.seed(123)
data <- rnorm(100, mean = 5, sd = 1)
mod_first_test <- t.test(data, mu = 4)
parameters::parameters(mod_first_test)
report::report(mod_first_test)

# two-sample t-test
group1 <- rnorm(50, mean = 5, sd = 1)
group2 <- rnorm(50, mean = 5.5, sd = 1)
mod_t_test_result <- t.test(group1, group2)
report::report(mod_t_test_result)

# paired t-test
pre_test <- rnorm(30, mean = 80, sd = 10)
post_test <- rnorm(30, mean = pre_test + 5, sd = 5)
mod_pre_post <- t.test(pre_test, post_test, paired = TRUE)
report::report(mod_pre_post)

# --- regression using simulation --------------------------------------------

set.seed(123)
n <- 100
treatment <- rnorm(n, mean = 50, sd = 10)
beta_a <- 2
outcome <- 5 + beta_a * treatment + rnorm(n, mean = 0, sd = 20)

df <- data.frame(treatment = treatment, outcome = outcome)

fit <- lm(outcome ~ treatment, data = df)
summary(fit)

predicted_values <- ggeffects::ggemmeans(fit, terms = c("treatment"))
plot(predicted_values, dot_alpha = 0.35, show_data = TRUE, jitter = .1)

# --- equivalence of ANOVA and regression ------------------------------------

set.seed(123)
n <- 90
k <- 3

group <- factor(rep(1:k, each = n / k))
means <- c(100, 100, 220)
sd <- 15
y <- rnorm(n, mean = rep(means, each = n / k), sd = sd)
df_1 <- cbind.data.frame(y, group)

# ANOVA
anova_model <- aov(y ~ group, data = df_1)
report::report(anova_model)

# regression (equivalent)
fit_regression <- lm(y ~ group, data = df_1)
parameters::model_parameters(fit_regression)

# --- combination plots ------------------------------------------------------

coefficient_plot <- plot(parameters::model_parameters(fit_regression))

predictive_plot <- plot(
  ggeffects::ggpredict(fit_regression, terms = "group"),
  dot_alpha = 0.35, show_data = TRUE, jitter = .1, colors = "reefs"
) +
  scale_y_continuous(limits = c(0, 260)) +
  labs(title = "predictive graph", x = "treatment group", y = "response")

my_first_combination_plot <- coefficient_plot / predictive_plot +
  plot_annotation(title = "coefficient and predictive plots", tag_levels = "A")

my_first_combination_plot

# --- post hoc comparisons --------------------------------------------------

tukey_post_hoc <- TukeyHSD(anova_model)
print(tukey_post_hoc)
plot(tukey_post_hoc)

# --- adding complexity in simulation ----------------------------------------

data_frame <- data.frame(
  ID = 1:n,
  Gender = sample(c("Male", "Female"), n, replace = TRUE),
  Age = rnorm(n, mean = 30, sd = 5),
  Income = rnorm(n, mean = 50000, sd = 10000)
)

# income as a function of age
intercept <- 20000
beta_age <- 1500
error_sd <- 10000
Age <- rnorm(n, mean = 30, sd = 5)
Income <- intercept + beta_age * Age + rnorm(n, mean = 0, sd = error_sd)
data_complex <- data.frame(Age, Income)

ggplot(data_complex, aes(x = Age, y = Income)) +
  geom_point() +
  theme_minimal() +
  labs(title = "simulated age vs. income", x = "age", y = "income")