# ============================================================================ # PSYC 434 — Lab 3: Regression, Graphing, and Simulation # self-standing script — run from top to bottom # ============================================================================ # --- packages --------------------------------------------------------------- library(tidyverse) library(ggplot2) library(patchwork) library(parameters) library(report) library(ggeffects) # --- simulate continuous and categorical data ------------------------------- set.seed(123) n <- 100 mean <- 50 sd <- 10 data_continuous <- rnorm(n, mean, sd) head(data_continuous) hist(data_continuous) levels <- c("Male", "Female") data_categorical <- sample(levels, n, replace = TRUE) table(data_categorical) data_categorical_unequal <- sample(levels, n, replace = TRUE, prob = c(0.3, 0.7)) table(data_categorical_unequal) # --- simulating outcomes from treatments ------------------------------------ set.seed(123) groupA_scores <- rnorm(100, mean = 100, sd = 15) groupB_scores <- rnorm(100, mean = 105, sd = 15) df_scores <- data.frame( group = rep(c("A", "B"), each = 100), scores = c(groupA_scores, groupB_scores) ) df_scores_1 <- df_scores |> mutate(group = as.factor(group)) # box plot ggplot(df_scores_1, aes(x = group, y = scores, fill = group)) + geom_boxplot() + theme_minimal() + labs(title = "score distribution by group", x = "group", y = "scores") # histogram with facets ggplot(df_scores_1, aes(x = scores, fill = group)) + geom_histogram(binwidth = 5, color = "black") + labs(title = "distribution of scores by group", x = "scores", y = "frequency") + facet_wrap(~group, ncol = 1) + theme_minimal() # --- statistical tests ------------------------------------------------------ # one-sample t-test set.seed(123) data <- rnorm(100, mean = 5, sd = 1) mod_first_test <- t.test(data, mu = 4) parameters::parameters(mod_first_test) report::report(mod_first_test) # two-sample t-test group1 <- rnorm(50, mean = 5, sd = 1) group2 <- rnorm(50, mean = 5.5, sd = 1) mod_t_test_result <- t.test(group1, group2) report::report(mod_t_test_result) # paired t-test pre_test <- rnorm(30, mean = 80, sd = 10) post_test <- rnorm(30, mean = pre_test + 5, sd = 5) mod_pre_post <- t.test(pre_test, post_test, paired = TRUE) report::report(mod_pre_post) # --- regression using simulation -------------------------------------------- set.seed(123) n <- 100 treatment <- rnorm(n, mean = 50, sd = 10) beta_a <- 2 outcome <- 5 + beta_a * treatment + rnorm(n, mean = 0, sd = 20) df <- data.frame(treatment = treatment, outcome = outcome) fit <- lm(outcome ~ treatment, data = df) summary(fit) predicted_values <- ggeffects::ggemmeans(fit, terms = c("treatment")) plot(predicted_values, dot_alpha = 0.35, show_data = TRUE, jitter = .1) # --- equivalence of ANOVA and regression ------------------------------------ set.seed(123) n <- 90 k <- 3 group <- factor(rep(1:k, each = n / k)) means <- c(100, 100, 220) sd <- 15 y <- rnorm(n, mean = rep(means, each = n / k), sd = sd) df_1 <- cbind.data.frame(y, group) # ANOVA anova_model <- aov(y ~ group, data = df_1) report::report(anova_model) # regression (equivalent) fit_regression <- lm(y ~ group, data = df_1) parameters::model_parameters(fit_regression) # --- combination plots ------------------------------------------------------ coefficient_plot <- plot(parameters::model_parameters(fit_regression)) predictive_plot <- plot( ggeffects::ggpredict(fit_regression, terms = "group"), dot_alpha = 0.35, show_data = TRUE, jitter = .1, colors = "reefs" ) + scale_y_continuous(limits = c(0, 260)) + labs(title = "predictive graph", x = "treatment group", y = "response") my_first_combination_plot <- coefficient_plot / predictive_plot + plot_annotation(title = "coefficient and predictive plots", tag_levels = "A") my_first_combination_plot # --- post hoc comparisons -------------------------------------------------- tukey_post_hoc <- TukeyHSD(anova_model) print(tukey_post_hoc) plot(tukey_post_hoc) # --- adding complexity in simulation ---------------------------------------- data_frame <- data.frame( ID = 1:n, Gender = sample(c("Male", "Female"), n, replace = TRUE), Age = rnorm(n, mean = 30, sd = 5), Income = rnorm(n, mean = 50000, sd = 10000) ) # income as a function of age intercept <- 20000 beta_age <- 1500 error_sd <- 10000 Age <- rnorm(n, mean = 30, sd = 5) Income <- intercept + beta_age * Age + rnorm(n, mean = 0, sd = error_sd) data_complex <- data.frame(Age, Income) ggplot(data_complex, aes(x = Age, y = Income)) + geom_point() + theme_minimal() + labs(title = "simulated age vs. income", x = "age", y = "income")