# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # EVALUATION OF RANDOMIZED CONTROLLED TRIALS USING # # R: A TUTORIAL FOR MENTAL HEALTH RESEARCHERS # # Complete Analysis Code # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## 1. Introduction ------------------------------------------------------------ ## 2. Preparation ------------------------------------------------------------- ### 2.1. R and RStudio -------------------------------------------------------- ### 2.2. Packages ------------------------------------------------------------- install.packages("tidyverse") library(tidyverse) ### 2.3. Importing Data ------------------------------------------------------- library(openxlsx) data <- read.xlsx("data.xlsx") ### 2.3. Functions ------------------------------------------------------------ sqrt(9) max(data$age) mean(data$cesd.1) mean(data$cesd.1, na.rm = TRUE) ### 2.5 Objects --------------------------------------------------------------- birth_year <- 1985 # Call the saved object "birth_year" birth_year 1985 -> birth_year birth_year = 1985 assign("birth_year", 1985) vector <- c(8, 74, 1, 6) class(data$age) glimpse(data) name <- c("Sarah", "Paula", "Antonia", "Peter") age <- c(45, 31, 31, 27) df <- data.frame(name, age, likes_liquorice); df colnames(df) # Show the dataframe's columns names ncol(df) # Show the number of columns rownames(df) # Show the dataframe's row names nrow(df) # Show the number of rows name <- "liquorice data" list <- list(data = df, title = name) ### 2.6 Operators ------------------------------------------------------------- # Using a semi-colon, we can write several lines of code in one: 5+8; 6^3; 6*(10/3) # Check if "Capitalized" is equal to "capitalized" # Note the upper vs. lowercase spelling "Capitalized" == "capitalized" # Define two objects (x,y); then test if two conditions are fulfilled: x <- 20; y <- 9 x > 10 & y != 10 data$age > 45 data %>% pull(age) %>% mean() %>% log() ### 2.7 Slicing & Indexing ---------------------------------------------------- data[4,7] data[1:3, c(16, 6)] data[,21] data[1,"age"] data[data$age > 40,"cesd.0"] data %>% filter(age > 40, sex == 0) %>% select(cesd.0, cesd.1, cesd.2) %>% head(3) ## 3. Descriptive Analysis ---------------------------------------------------- library(skimr) skim(data) data$sex <- as.numeric(data$sex) data$prevpsychoth <- as.numeric(data$prevpsychoth) data$prevtraining <- as.numeric(data$prevtraining) data$child <- as.numeric(data$child) data$employ <- as.numeric(data$employ) data$degree <- as.numeric(data$degree) data %>% filter(group == 0) %>% skim() # Control group data %>% filter(group == 1) %>% skim() # Intervention group library(psych) multi.hist(data %>% select(cesd.0, cesd.1, cesd.2), ncol = 3) x <- c(1, 2, 3, 4, 5, NA, NA, 200) # Create vector sum(is.na(x)) # Count the number of missings # All trial data with(data, { c(sum(is.na(cesd.0)), sum(is.na(cesd.1)), sum(is.na(cesd.2))) }) -> na.all # Calculate percentages na.all.p <- na.all/nrow(data) na.all.p # Do the same for intervention group only... data %>% filter(group == 1) %>% with({ c(sum(is.na(cesd.0)), sum(is.na(cesd.1)), sum(is.na(cesd.2))) }) -> na.ig na.ig.p <- na.ig/nrow(data %>% filter(group == 1)) # ... and control group only data %>% filter(group == 0) %>% with({ c(sum(is.na(cesd.0)), sum(is.na(cesd.1)), sum(is.na(cesd.2))) }) -> na.cg na.cg.p <- na.cg/nrow(data %>% filter(group == 0)) # Collect all data in dataframe na <- data.frame(na.all, na.all.p = na.all.p*100, na.ig, na.ig.p = na.ig.p*100, na.cg, na.cg.p = na.cg.p*100) # Give the rows fitting names rownames(na) = c("t0", "t1", "t2") round(na, 2) ## 4. Missing Data ------------------------------------------------------------ # Define some example data; x has three missings y <- 1:10 x <- c(1, NA, NA, NA, 3, 5, 8, 10, -1, 10) summary(lm(y ~ x)) ### 4.1 Missing Data Mechanism ------------------------------------------------ ### 4.2 Multiple Imputation --------------------------------------------------- library(mice) library(miceadds) md.pattern(data) flux(data)[,1:3] data$R <- !is.na(data$cesd.1) data.cg <- data %>% filter(group == 0) data.ig <- data %>% filter(group == 1) # R is not needed for other analyses, so # we remove it from the original data data$R = NULL glm(R ~ scale(cesd.0) + scale(badssf.0) + scale(hadsa.0) + scale(age) + sex + prevpsychoth + prevtraining + child + employ + as.factor(rel) + as.factor(degree) + scale(atsphs.0) + scale(ceq), binomial(link = "logit"), data.ig) %>% summary() glm(R ~ scale(cesd.0) + scale(badssf.0) + scale(hadsa.0) + scale(age) + sex + prevpsychoth + prevtraining + child + employ + as.factor(rel) + as.factor(degree) + scale(atsphs.0) + scale(ceq), binomial(link = "logit"), data.cg) %>% summary() inlist <- c("badssf.0", "hadsa.0", "prevpsychoth", "rel") imp0 <- mice(data, maxit = 0) imp0$loggedEvents # Include all variables with logged events ("id" in our case) # in our "outlist"; then remove those variables from the data frame outlist <- imp0$loggedEvents$out imp.data <- data %>% select(-all_of(outlist)) pred <- quickpred(imp.data, mincor = 0.05, minpuc = 0.05, include = inlist) # Mean number of predictors table(rowSums(pred))[-1] %>% {as.numeric(names(.)) %*% . / sum(.)} # Set predictor "group" to zero pred[,"group"] = 0 library(plot.matrix) plot(pred, main = "Imputation Matrix", col = c("grey", "blue"), xlab = "predicts", ylab = "is predicted", adj = 0, las = 2, cex.axis = 0.5) imp0 = mice(imp.data, maxit = 0, predictorMatrix = pred) imp0$method # Find variables without missings no.missings = imp0$method == "" # Set imputation method to "bygroup" imp0$method %>% replace(., . != "", "bygroup") -> imp.method # Define the actual imputation method for all # "bygroup" variables imp0$method[!no.missings] %>% as.list() -> imp.function library(purrr) rep("group", length(imp.method[!no.missings])) %>% set_names(names(imp.method[!no.missings])) %>% as.list() -> imp.group.variable mice(imp.data, predictorMatrix = pred, method = imp.method, imputationFunction = imp.function, group = imp.group.variable, m = 50, maxit = 50, seed = 123) -> imp save(imp, file="imp.rda") plot(imp, layout = c(4, ceiling(sum(!no.missings)/2))) densityplot(imp, ~ cesd.1 + cesd.2 | as.factor(group)) remotes::install_github("UCL/RefBasedMI") library(RefBasedMI) data$id = 1:nrow(data) pivot_longer(data = data, ends_with(c(".1", ".2")), values_to = "value", names_to = c("outcome", "time"), names_pattern = "(.*).(.)") -> data.long data.long$time data.long %>% mutate(time = recode(time, `1` = 7, `2` = 12)) -> data.long data.long[1:10, c("id", "cesd.0", "group", "outcome", "time", "value")] data.long %>% filter(outcome == "cesd") -> data.long.cesd imp.j2r <- RefBasedMI(data = data.long.cesd, covar = c(age, sex, prevpsychoth, prevtraining, child, employ, rel, cesd.0, atsphs.0, ceq, badssf.0, hadsa.0), depvar = value, treatvar = group, idvar = id, timevar = time, method = "J2R", reference = 0, M = 50, seed = 123) %>% as.mids() save(imp.j2r, file="imp.j2r.rda") ### 4.3 Rubin's Combination Rules --------------------------------------------- ### 4.4 Functional Programming ------------------------------------------------ library(mitml) implist <- mids2mitml.list(imp) # Create an empty vector in which our results will be stored means <- vector() # Loop through all m=50 imputation sets for (i in 1:50){ means[i] <- implist[[i]] %>% pull(cesd.1) %>% mean() } mean(means) implist %>% map_dbl(~mean(.$cesd.1)) %>% mean() # 'x' represents the individual list element in 'list' map(list, function(x) mean(x$variable)) # '.' represents the individual list element in 'list' map(list, ~ mean(.$variable)) ## 5. Effectiveness Analysis -------------------------------------------------- ### 5.1 Continuous Endpoints -------------------------------------------------- library(mitml) library(purrr) library(miceadds) m.lm <- lm(cesd.1 ~ 1 + group + scale(cesd.0), data = imp$data) summary(m.lm) anova(m.lm) implist <- mids2mitml.list(imp) class(implist) with(implist, lm(cesd.1 ~ 1 + group + cesd.0)) %>% testEstimates() # df.com is defined as sample size minus the number of parameters: df.com <- nrow(imp$data) - length(coef(m.lm)) with(implist, lm(cesd.1 ~ 1 + group + cesd.0)) %>% testEstimates(df.com = df.com) with(implist, lm(cesd.1 ~ 1 + group + cesd.0)) %>% map_dbl(~anova(.)$`F value`[1]) -> Fvalues micombine.F(Fvalues, df1=1) implist.j2r <- mids2mitml.list(imp.j2r) map(implist.j2r, function(x){ x %>% mutate(time = recode(time, `7` = 1, `12` = 2)) %>% pivot_wider(names_from = c(outcome, time), names_sep = ".", values_from = value) }) -> implist.j2r class(implist.j2r) implist.j2r <- as.mitml.list(implist.j2r) with(implist.j2r, lm(cesd.1 ~ 1 + group + cesd.0)) %>% testEstimates() # Calculate mean SD for both groups across imputed data sets implist %>% map_dbl(~ filter(., group==1) %>% pull(cesd.1) %>% sd()) %>% mean() -> s_ig implist %>% map_dbl(~ filter(., group==0) %>% pull(cesd.1) %>% sd()) %>% mean() -> s_cg # Define sample size in both groups n_ig <- n_cg <- 273 # Calculate pooled SD sqrt(((n_ig-1)*s_ig^2 + (n_cg-1)*s_cg^2)/ (n_ig+n_cg-2)) (m.es <- with(implist, lm(I(cesd.1/8.586) ~ 1 + group + cesd.0)) %>% testEstimates()) confint(m.es) (m.es.cca <- lm(I(cesd.1/8.586) ~ 1 + group + cesd.0, data = imp$data)) confint(m.es.cca) (m.es.j2r <- with(implist.j2r, lm(I(cesd.1/8.586) ~ 1 + group + cesd.0)) %>% testEstimates()) confint(m.es.j2r) ### 5.2 Binary Endpoints ------------------------------------------------------ rci <- function(y0, y1, ryy){ diff = y1-y0 sdiff = sqrt(2*((sd(y0)*sqrt(1-ryy))^2)) return(diff/sdiff) } ryy <- 0.85 implist %>% map_dbl(~sqrt(2*((sd(.$cesd.0)*sqrt(1-ryy))^2))) %>% mean() -> s_diff s_diff implist %>% map(function(x){ # Calculate RCI for each person x$rci = with(x, rci(y0 = cesd.0, y1 = cesd.1, ryy = ryy)) # Create binary indicator for # reliable improvement & deterioration x$ri = ifelse(x$rci <= -1.96, 1, 0) x$rd = ifelse(x$rci >= 1.96, 1, 0) x }) -> implist # Get the number of imputation sets m <- length(implist) # Generate a reliable improvement counts table # Aggregate the values across all data sets. implist %>% map(~ with(., table(group, ri))) %>% {Reduce(`+`, .)/m} %>% as.matrix() %>% round() -> table.ri table.ri # Calculate sample size in both groups n.ig <- sum(table.ri[2,]) n.cg <- sum(table.ri[1,]) # Calculate proportion of reliable improvement cases p.ig <- table.ri[2,2]/n.ig p.cg <- table.ri[1,2]/n.cg # Calculate NNT (inverse risk reduction) (p.ig - p.cg)^-1 dat.plt.ig <- implist[[1]] %>% filter(group == 1) dat.plt.cg <- implist[[1]] %>% filter(group == 0) # We require the scales package to be loaded. library(scales) # intervention group plot plot(dat.plt.ig$cesd.0, dat.plt.ig$cesd.1, xlim = c(0,50), ylim = c(0,50), xlab = "CES-D (Baseline)", ylab = "CES-D (Post-Test)", pch = 16, col = alpha("black", 0.3), main = "Intervention Group") abline(coef = c(0,1), col = "blue") polygon(c(0,0,50,50), c(0-s_diff*qnorm(.975),0+s_diff*qnorm(.975), 50+s_diff*qnorm(.975),50-s_diff*qnorm(.975)), col = alpha("lightblue", 0.3), border = NA) text(50, 0, "Improvement", pos=2, col="darkgreen") text(0, 50, "Deterioration", pos=4, col="darkred") # control group plot plot(dat.plt.cg$cesd.0, dat.plt.cg$cesd.1, xlim = c(0,50), ylim = c(0,50), xlab = "CES-D (Baseline)", ylab = "CES-D (Post-Test)", pch = 16, col = alpha("black", 0.3), main = "Control Group") abline(coef = c(0,1), col = "blue") polygon(c(0,0,50,50), c(0-s_diff*qnorm(.975),0+s_diff*qnorm(.975), 50+s_diff*qnorm(.975),50-s_diff*qnorm(.975)), col = alpha("lightblue", 0.3), border = NA) text(50, 0, "Improvement", pos=2, col="darkgreen") text(0, 50, "Deterioration", pos=4, col="darkred") with(implist, glm(ri ~ group + scale(cesd.0), family = binomial("logit"))) %>% testEstimates() -> m.glm; m.glm exp(m.glm$estimates["group", "Estimate"]) with(implist, glm(ri ~ group, family = binomial("logit"))) %>% testEstimates() -> m.glm.ua exp(m.glm.ua$estimates["group", "Estimate"]) m.glm <- glm(ri ~ group + scale(cesd.0), data = implist[[1]], family = binomial("logit")) library(stdReg) m.std <- stdGlm(m.glm, X = "group", data = implist[[1]]) summary(m.std, contrast = "ratio", reference = 0) summary(m.std, contrast = "difference", reference = 0) # Fit model with treatment-covariate interactions m.glm2 <- glm(ri ~ group*(scale(cesd.0) + scale(age) + sex), data = implist[[1]], family = binomial) # Return marginal risk ratio m.std2 <- stdGlm(m.glm2, X = "group", data = implist[[1]]) summary(m.std2, contrast = "ratio", reference = 0) implist %>% map_dfr(function(x){ # Run GLM and standardize std.glm <- glm(ri ~ group + scale(cesd.0), data = x, family = binomial) %>% stdGlm(., X = "group", data = x) # Calculate the standardized log-RR summary(std.glm, contrast = "ratio", transform = "log", reference = 0)$est.table["1", 1:2] }) -> std.mi pool.scalar( Q = std.mi$Estimate, U = std.mi$`Std. Error`^2)[c("qbar", "t", "df")] -> res.std.rr within(res.std.rr, { # Calculate CI based on t-distribution with MI degrees of freedom lower.qbar <- qbar - qt(0.975, df) * sqrt(t) upper.qbar <- qbar + qt(0.975, df) * sqrt(t) pval <- pt(qbar/sqrt(t), df, lower = FALSE)*2 # Use antilog to transform logRR values to RR rr <- exp(qbar) lower.rr <- exp(lower.qbar) upper.rr <- exp(upper.qbar) }) -> res.RR res.RR[c("rr", "lower.rr", "upper.rr")] implist %>% map_dfr(function(x){ # Run GLM and standardize std.glm <- glm(ri ~ group + scale(cesd.0), data = x, family = binomial) %>% stdGlm(., X = "group", data = x) # Calculate the standardized log-RR summary(std.glm, contrast = "difference", transform = "logit", reference = 0)$est.table["1", 1:2] }) -> std.mi pool.scalar( Q = std.mi$Estimate, U = std.mi$`Std. Error`^2)[c("qbar", "t", "df")] -> res.std.rd within(res.std.rd, { # Calculate CI based on t-distribution with MI degrees of freedom lower.qbar <- qbar - qt(0.975, df) * sqrt(t) upper.qbar <- qbar + qt(0.975, df) * sqrt(t) pval <- pt(qbar/sqrt(t), df, lower = FALSE)*2 # Use inverse logit to transform logit RD values to RD # We also have to subtract the inverse logit RD for the reference group rd <- plogis(qbar) - plogis(0) lower.rd <- plogis(lower.qbar) - plogis(0) upper.rd <- plogis(upper.qbar) - plogis(0) }) -> res.RD res.RD[c("rd", "lower.rd", "upper.rd")] as.numeric(res.RD[c("rd", "lower.rd", "upper.rd")])^-1 ## 6. Miscellaneous Topics ---------------------------------------------------- ### 6.1 Multiple Testing ------------------------------------------------------ # ANCOVA for post-test CES-D with(implist, lm(cesd.1 ~ 1 + group + cesd.0)) %>% map_dbl(~anova(.)$`F value`[1]) %>% micombine.F(df1=1) -> F.cesd # ANCOVA for post-test BADS-SF with(implist, lm(badssf.1 ~ 1 + group + badssf.0)) %>% map_dbl(~anova(.)$`F value`[1]) %>% micombine.F(df1=1) -> F.badssf # ANCOVA for post-test HADS-A with(implist, lm(hadsa.1 ~ 1 + group + hadsa.0)) %>% map_dbl(~anova(.)$`F value`[1]) %>% micombine.F(df1=1) -> F.hadsa # Collect all three p-values (p <- c(F.cesd["p"], F.badssf["p"], F.hadsa["p"])) p.adjust(p, method = "bonferroni") p.adjust(p, method = "holm") ### 6.2 Longitudinal Data ----------------------------------------------------- implist %>% map(function(x){ x %>% mutate(id = 1:nrow(x)) %>% pivot_longer(ends_with(c(".1", ".2")), values_to = "value", names_to = c("outcome", "time"), names_pattern = "(.*).(.)") %>% mutate(time = recode(time, `1` = 7, `2` = 12)) }) -> implist.long # Extract a slice of the first imputed data set implist.long[[1]][1:10, c("id", "outcome", "time", "value", "cesd.0", "badssf.0", "hadsa.0")] implist.long %>% map(~ filter(., outcome == "cesd") %>% mutate(id = as.factor(id), group = as.factor(group), time = recode(time, `7` = "7 wks", `12` = "12 wks") %>% as.factor()) ) -> implist.long library(mmrm) m.mmrm <- mmrm(value ~ group + scale(cesd.0) + us(time|group/id), method = "Kenward-Roger", data = implist.long[[1]]) summary(m.mmrm) implist.long %>% map(~ mmrm(value ~ group + scale(cesd.0) + us(time|group/id), method = "Kenward-Roger", data = .)) %>% testEstimates() implist.long %>% map(~ mmrm(value ~ group*time + scale(cesd.0) + us(time|group/id), method = "Kenward-Roger", data = .)) %>% testEstimates() implist.long %>% map(~ mutate(., time = factor(time, levels = c("7 wks", "12 wks")))) %>% map(~ mmrm(value ~ group*time + scale(cesd.0) + us(time|group/id), method = "Kenward-Roger", data = .)) %>% testEstimates() -> m.mmrm.7wk m.mmrm.7wk confint(m.mmrm.7wk) ### 6.3 Reporting & Reproducibility ------------------------------------------- m <- length(implist) catvars <- c("sex", "prevpsychoth", "prevtraining", "child", "employ", "rel", "degree", "group") implist %>% map(~mutate_at(., catvars, as.factor)) -> implist implist %>% map(~skimReport(.)) %>% {.[[1]]$numerics[,1] ->> var.names;.} %>% map(~.$numerics[,2:4]) %>% Reduce(`+`, .)/m -> num.desc.full implist %>% map(~filter(., group == 1)) %>% map(~skimReport(.)) %>% map(~.$numerics[,2:4]) %>% Reduce(`+`, .)/m -> num.desc.ig implist %>% map(~filter(., group == 0)) %>% map(~skimReport(.)) %>% map(~.$numerics[,2:4]) %>% Reduce(`+`, .)/m -> num.desc.cg tab.desc <- cbind(var.names, num.desc.full, num.desc.ig, num.desc.cg) tab.desc library(openxlsx) write.xlsx(tab.desc, file="imputed_descriptives.xlsx")