# ============================================================================
# psyc 434 -- lab 4: student practice script
# self-contained script -- run from top to bottom
# ============================================================================

# --- packages ---------------------------------------------------------------
# packages this script depends on
required_packages <- c("tidyverse")

# keep only the names whose namespace cannot be loaded,
# i.e. the packages that still need to be installed
missing_packages <- Filter(
  \(pkg) !requireNamespace(pkg, quietly = TRUE),
  required_packages
)

# install just the missing ones; nothing is installed when all are present
if (length(missing_packages) > 0) install.packages(missing_packages)

# attach the tidyverse for the rest of the script
library(tidyverse)

# start here.
# work one exercise at a time.
# change only the model formula after `~`.
# then rerun the model and the lines just below it.
#
# this script has three core exercises.
# 1. one predictor:
#    fit `exam_score ~ study_hours`, then change it to `exam_score ~ 1`
#    and see what the fitted line becomes.
# 2. add a second predictor:
#    start with `exam_score ~ study_hours`, then add `motivation`
#    and see how the coefficient for `study_hours` changes.
# 3. add an interaction:
#    start with `exam_score ~ study_hours + workshop`, then change it to
#    `exam_score ~ study_hours * workshop` and compare the fitted lines.
#
# the aim is not to memorise syntax.
# the aim is to notice what each change in the formula does.
#
# questions to answer as you work:
# - in exercise 1, what happens to the fitted line when you change
#   `exam_score ~ study_hours` to `exam_score ~ 1`?
# - in exercise 2, does the coefficient for `study_hours` get larger or
#   smaller after adding `motivation`?
# - in exercise 3, what changes when you replace `+` with `*` in the formula?
# - which formula felt easiest to interpret, and which felt hardest?


# --- exercise 1: one predictor ----------------------------------------------
# simulate study hours and exam scores, plot them, fit a linear model,
# and plot the model's fitted values on top of the data

# `set.seed()` makes the random numbers reproducible
set.seed(434)

# `n` is the number of students we simulate
n <- 200

# `rnorm()` draws random values from a normal distribution
# here: study hours with mean 10 and standard deviation 2
study_hours <- rnorm(n, mean = 10, sd = 2)

# exam score follows a linear relationship with study hours
# (intercept 40, slope 4)
# the final `rnorm()` term adds random error with standard deviation 5
exam_score <- 40 + 4 * study_hours + rnorm(n, mean = 0, sd = 5)

# `tibble()` makes a data frame
df_simple <- tibble(
  study_hours = study_hours,
  exam_score = exam_score
)

# scatterplot of the simulated data with a straight-line smoother
# `formula = y ~ x` states the smoother's formula explicitly, so ggplot2
# does not print a "`geom_smooth()` using formula" message each time
ggplot(df_simple, aes(x = study_hours, y = exam_score)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "exercise 1",
    subtitle = "one predictor, one outcome"
  )

# coding practice:
# 1. run the model below as written
# 2. change the formula to `exam_score ~ 1`
# 3. rerun the summary and the fitted-value plot
# 4. question: what happens to the fitted line?

# edit here
fit_1 <- lm(
  exam_score ~ study_hours,
  data = df_simple
)

# coefficient table and fit statistics for the current formula
summary(fit_1)

# `predict()` without new data returns one fitted value per row used in the fit
# sorting by `study_hours` puts the rows in left-to-right plotting order
pred_1 <- df_simple |>
  mutate(fitted_value = predict(fit_1)) |>
  arrange(study_hours)

# points are the simulated data; the blue line joins the fitted values
ggplot(pred_1, aes(x = study_hours, y = exam_score)) +
  geom_point(alpha = 0.6) +
  geom_line(aes(y = fitted_value), colour = "steelblue", linewidth = 1) +
  labs(
    title = "exercise 1 fitted values",
    subtitle = "rerun this after you change the formula"
  )


# --- exercise 2: add a second predictor -------------------------------------
# simulate data in which `motivation` feeds into both `study_hours` and
# `exam_score`, then fit a model that can be extended with `motivation`

# a new seed and sample size for this exercise
set.seed(435)
n <- 300

# motivation is another normally distributed variable (mean 0, sd 1)
motivation <- rnorm(n, mean = 0, sd = 1)

# study hours depend partly on motivation
# (baseline 8 hours, plus 1.5 hours per unit of motivation, plus noise)
study_hours <- 8 + 1.5 * motivation + rnorm(n, mean = 0, sd = 1.5)

# exam score depends on both study hours and motivation
# the final `rnorm()` term adds random error with standard deviation 6
exam_score <- 55 + 2.5 * study_hours + 6 * motivation +
  rnorm(n, mean = 0, sd = 6)

# collect the three simulated variables in one data frame
df_multiple <- tibble(
  study_hours = study_hours,
  motivation = motivation,
  exam_score = exam_score
)

# scatterplot showing how the two predictors relate to each other
# `formula = y ~ x` states the smoother's formula explicitly, so ggplot2
# does not print a "`geom_smooth()` using formula" message each time
ggplot(df_multiple, aes(x = motivation, y = study_hours)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "exercise 2",
    subtitle = "motivation is related to study hours"
  )

# coding practice:
# 1. run the model below as written
# 2. look at the coefficient for `study_hours`
# 3. change the formula to `exam_score ~ study_hours + motivation`
# 4. rerun the summary and see how the coefficient changes
# 5. question: does the `study_hours` coefficient get larger or smaller?

# edit here
fit_2 <- lm(
  exam_score ~ study_hours,
  data = df_multiple
)

# coefficient table and fit statistics for the current formula
summary(fit_2)


# --- exercise 3: interaction ------------------------------------------------
# simulate data in which the effect of study hours differs by workshop
# attendance, then compare an additive model with an interaction model

# a new seed and sample size for this exercise
set.seed(437)
n <- 320

# `runif()` draws random values spread uniformly between 0 and 10
study_hours <- runif(n, min = 0, max = 10)

# `rbinom(n, 1, 0.5)` draws 0 or 1
# here, 1 means attended the workshop
workshop <- rbinom(n, 1, 0.5)

# this outcome has an interaction
# the slope for study hours is steeper when workshop = 1
# (slope 2 when workshop = 0, slope 2 + 1.5 = 3.5 when workshop = 1)
exam_score <- 45 + 2 * study_hours + 5 * workshop +
  1.5 * study_hours * workshop + rnorm(n, mean = 0, sd = 4)

# collect the three simulated variables in one data frame
df_interaction <- tibble(
  study_hours = study_hours,
  workshop = workshop,
  exam_score = exam_score
)

# scatterplot with one straight-line smoother per workshop group
# `factor(workshop)` makes ggplot treat 0/1 as two groups, not a number scale
# `formula = y ~ x` states the smoother's formula explicitly, so ggplot2
# does not print a "`geom_smooth()` using formula" message each time
ggplot(
  df_interaction,
  aes(x = study_hours, y = exam_score, colour = factor(workshop))
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    title = "exercise 3",
    subtitle = "the lines are not parallel",
    colour = "workshop"
  )

# coding practice:
# 1. run the model below as written
# 2. run the summary and fitted-value plot
# 3. change the formula to `exam_score ~ study_hours * workshop`
# 4. rerun the same lines and compare the fitted lines
# 5. question: when you replace `+` with `*`, do the fitted lines stay parallel?

# edit here
fit_3 <- lm(
  exam_score ~ study_hours + workshop,
  data = df_interaction
)

# coefficient table and fit statistics for the current formula
summary(fit_3)

# `predict()` without new data returns one fitted value per row used in the fit
# sorting by group and then by `study_hours` orders the rows within each group
pred_3 <- df_interaction |>
  mutate(fitted_value = predict(fit_3)) |>
  arrange(workshop, study_hours)

# points are the simulated data; one fitted line is drawn per workshop group
ggplot(
  pred_3,
  aes(x = study_hours, y = exam_score, colour = factor(workshop))
) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = fitted_value), linewidth = 1) +
  labs(
    title = "exercise 3 fitted values",
    subtitle = "rerun this after you change the formula",
    colour = "workshop"
  )


# if you finish early, open `scripts/lab-04.R`
# that file has extra annotation, more exercises, and an optional extension