library(tidyverse) # Load data data <- read_csv(file="../Data/Cross-sec_full.csv", na = c(".","","NA")) # 1) Pasting together variables # Ex. Combining Risk and Diagnosis variables (creating GROUP variable) data$GROUP # We will use the following functions : paste0, grepl, gsub, ifelse # Let's go through them to see what they do with simply toy examples # 1) paste0 paste0("x", "y", "z") paste("x", "y", "z", sep="") a <- "x" b <- "y" c <- "z" paste0(a, b, c) # 2) gsub gsub("Pos", "ASD", "HR_Pos") gsub("V", "", "V24") # 3) if...else and ifelse() condition_1 <- "GO" condition_1 <- "STOP" condition_2 <- "REALLY GO" if(condition_1=="GO"|condition_2=="REALLY GO"){ print(2+2) print("PASS") }else{ print(2-2) print("FAIL") } ifelse(condition_1=="GO", "PASS", "FAIL") # Now let's start the pasting process data_v1 <- data %>% mutate(SSM_ASD_v24_edit = gsub("_ASD", "", SSM_ASD_v24)) ftable(data_v1$SSM_ASD_v24, data_v1$SSM_ASD_v24_edit) # Works, though doesn't look great data_v1 <- data %>% mutate(SSM_ASD_v24_edit = gsub("_ASD", "", SSM_ASD_v24), GROUP_v0 = paste0(`V24 demographics,Risk`, "_", SSM_ASD_v24_edit)) ftable(data_v1$GROUP, data_v1$GROUP_v0) ftable(data_v1$`V24 demographics,Risk`) # Let's try to get the two group variables to match data_v1 <- data %>% mutate(GROUP_v0 = ifelse(grepl("YES", SSM_ASD_v24), paste0(`V24 demographics,Risk`,"_","ASD"), ifelse(grepl("NO", SSM_ASD_v24), paste0(`V24 demographics,Risk`,"_","neg"), NA))) ftable(data_v1$GROUP, data_v1$GROUP_v0) # Great! Let's just remove the 36 data_v1 <- data %>% mutate(GROUP_v0 = gsub("_36_","_" , ifelse(grepl("YES", SSM_ASD_v24), paste0(`V24 demographics,Risk`,"_","ASD"), ifelse(grepl("NO", SSM_ASD_v24), paste0(`V24 demographics,Risk`,"_","neg"), NA)))) ftable(data_v1$GROUP, data_v1$GROUP_v0) # 2) Change reference level # Let's look at group differences in 24 month MSEL composite using ANOVA/linear regression lm(formula = `V24 mullen,composite_standard_score`~GROUP, data = data) # Can see reference level is HR_ASD. Want to change to LR negative factor(data$GROUP) data_v1 <- data %>% mutate(GROUP=relevel(factor(GROUP), ref="LR_neg")) data_v1$GROUP lm(formula = `V24 mullen,composite_standard_score`~GROUP, data = data_v1) # Can see reference level is LR_neg # 3) Summarize variables by group data_v1 <- data %>% group_by(GROUP) %>% summarise(mean_mullen_composite = mean(`V24 mullen,composite_standard_score`, na.rm=TRUE), sd_mullen_composite = sd(`V24 mullen,composite_standard_score`, na.rm=TRUE), sample_size = n()) data_v1 # 4) Pivot from wide to long and long to wide # Make sure visit in the same spot for each variable vars_to_convert <- names(data)[grepl("V06|V12|V24|V36", names(data))] vars_to_convert <- vars_to_convert[vars_to_convert!="V24 demographics,Risk"] # Convert to long data_long <- data %>% gather(variable, var_value, vars_to_convert) %>% separate(variable,c("Visit","Variable"),sep=3) %>% mutate(Variable=gsub("," ,"_", Variable)) %>% spread(key=Variable, value=var_value) # See annoying space in variable names, can easily fix data_long <- data %>% gather(variable, var_value, vars_to_convert) %>% separate(variable,c("Visit","Variable"),sep=3) %>% mutate(Variable=gsub("," ,"_" ,Variable), Variable=gsub(" ","",Variable)) %>% spread(key=Variable, value=var_value) # Can also use pivot_longer instead of gather (see online if interested) # Convert back to wide data_wide <- data_long %>% group_by(Identifiers) %>% gather(names(data_long)[!(names(data_long)%in%c("Identifiers", "SSM_ASD_v24", "V24 demographics,Risk", "GROUP", "Study_Site", "Gender", "Visit"))], key=variable, value=number) %>% unite(combi, variable, Visit) %>% spread(combi, number) # 5) Create Z scores # Suppose we want to create Z scores for Mullen composite at each time point using LR Neg # First, compute time-specific means and SDs Mullen composite for LR Negative as comparison population LR_mullen_stats <- data_long %>% filter(GROUP=="LR_neg") %>% group_by(Visit) %>% summarise(mean_mullen_composite = mean(mullen_composite_standard_score, na.rm=TRUE), sd_mullen_composite = sd(mullen_composite_standard_score, na.rm=TRUE)) # Now let's add these to the dataset. We need to merge them in data_with_LRmeans <- inner_join(data_long, LR_mullen_stats, by="Visit") View(data_with_LRmeans) data_with_zscore <- data_with_LRmeans %>% mutate(mullen_composite_zscore = (mullen_composite_standard_score-mean_mullen_composite)/sd_mullen_composite) # Let's look at boxplots of these z scores ggplot(data=data_with_zscore, mapping=aes(x=Visit, y=mullen_composite_zscore, fill=GROUP))+ geom_boxplot() # 6) IBIS real analysis examples # a) Convert visiting to a numeric variable for trjaectory analysis or regression data_long$Visit # Need to remove V and remove quotes " " to make numeric data_long <- data_long %>% mutate(visit_num = gsub("V", "", Visit)) data_long$visit_num # NOT yet numeric, can see quotes. Use as.numeric() to force into numeric and remove quotes data_long <- data_long %>% mutate(visit_num = as.numeric(gsub("V", "", Visit))) data_long$visit_num # Why did we have to remove the V first before converting to numeric with as.numeric()? # b) requests? # str_split()