### Tidyverse
# A MUCH better way of wrangling data

# Make sure the tidyverse package is installed. Only install when it is
# missing, so re-running this script does not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse) # Load tidyverse
# tidyverse is actually a collection of a bunch of separate packages
# (see console)

# Values coded as "." in the CSV file are read in as missing (NA)
data <- read.csv(file = "../Data/AOSI_small.csv", na.strings = ".")
names(data) # quick way of viewing all variables in data frame
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(data)
}

## Select
# select() function used to remove variables and/or order them
select(data, Identifiers, GROUP)
# First argument is dataset, next arguments are variables you want to keep
# in order you want them to appear (left to right)
select(data, -Identifiers, -GROUP)
# Can also specify variables you want to remove using - prefix
select(data, GROUP, everything())
# Can use everything() to fill in all other data variables in the order they
# originally appeared; useful when reordering the variables

## Arrange
# arrange() function used to order rows (observations) based on variables
arrange(data, V06.aosi.Candidate_Age)
# Sorting by ascending age. Note dataset is again 1st argument followed by
# variables
arrange(data, GROUP, V06.aosi.Candidate_Age)
# Can also do nested sorting using multiple variables
# (first by diagnosis, then age)
arrange(data, GROUP, desc(V06.aosi.Candidate_Age))
# Can also specify descending order for specific variable(s) using desc()
# function

## Mutate
# mutate() function used to create new variables
mutate(
  data,
  v06_aosi_zscore =
    (V06.aosi.total_score_1_18 -
      mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V06.aosi.total_score_1_18, na.rm = TRUE)
)
# Creating Z score for 6 month AOSI total score. Note use of mean() to compute
# mean, sd() to compute standard deviation, and na.rm = TRUE to remove missing
# values prior to calc of mean, sd. What happens if I omit na.rm = TRUE?
mutate(
  data,
  v06_aosi_zscore =
    (V06.aosi.total_score_1_18 -
      mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V06.aosi.total_score_1_18, na.rm = TRUE),
  v12_aosi_zscore =
    (V12.aosi.total_score_1_18 -
      mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V12.aosi.total_score_1_18, na.rm = TRUE)
)
# Can create multiple variables, just separate by comma
mutate(
  data,
  v06_aosi_zscore =
    (V06.aosi.total_score_1_18 -
      mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V06.aosi.total_score_1_18, na.rm = TRUE),
  v06_aosi_round = round(v06_aosi_zscore, digits = 3)
)
# Can also create variables based on other created variables in same mutate
# call. Here I round my Z-scores to 3 places after creating them

## Filter
# filter() function used to remove rows of data based on variables
filter(data, GROUP == "HR_ASD")
# Keep only rows with GROUP="HR_ASD". Use of == tests if "is equal to".
# Remember "=" saves values to an object like <-
filter(data, GROUP == "HR_ASD" & V06.aosi.total_score_1_18 > 10)
# Keep only rows with GROUP="HR_ASD" & AOSI at 6 months > 10.
# Use of & means BOTH conditions need to be true for a row to be kept
filter(data, GROUP == "HR_ASD" | V06.aosi.total_score_1_18 < 10)
# Use of | means "OR", only one (or more) needs to be true to be kept
filter(data, GROUP != "HR_ASD")
# Use of != denotes "NOT equal to", only those NOT equal to "HR_ASD" kept

## Pipe: %>%
# Notice all of these functions have the dataset as the first argument
# Suppose we have multiple data processing steps we want to do
data_1 <- mutate(
  data,
  v06_aosi_zscore =
    (V06.aosi.total_score_1_18 -
      mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V06.aosi.total_score_1_18, na.rm = TRUE),
  v12_aosi_zscore =
    (V12.aosi.total_score_1_18 -
      mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V12.aosi.total_score_1_18, na.rm = TRUE)
)
filter(data_1, GROUP == "HR_ASD")
# Very cumbersome, need to save each intermediate step as a new R object

# Instead, let's use the pipe operator, denoted by %>%
mutate(
  data,
  v06_aosi_zscore =
    (V06.aosi.total_score_1_18 -
      mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V06.aosi.total_score_1_18, na.rm = TRUE),
  v12_aosi_zscore =
    (V12.aosi.total_score_1_18 -
      mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
      sd(V12.aosi.total_score_1_18, na.rm = TRUE)
) %>%
  filter(GROUP == "HR_ASD")
# This results in the same output, and in fact carries out the same processes.
# How does this work?
# %>% simply takes everything from the left of the symbol, and pastes it into
# the first argument for the function call on the right.
# That is, the above is the same as
filter(
  mutate(
    data,
    v06_aosi_zscore =
      (V06.aosi.total_score_1_18 -
        mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V06.aosi.total_score_1_18, na.rm = TRUE),
    v12_aosi_zscore =
      (V12.aosi.total_score_1_18 -
        mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V12.aosi.total_score_1_18, na.rm = TRUE)
  ),
  GROUP == "HR_ASD"
)
# But the piped version is MUCH CLEANER!
# We can also use the pipe at the very beginning, starting from the dataset
data %>%
  mutate(
    v06_aosi_zscore =
      (V06.aosi.total_score_1_18 -
        mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V06.aosi.total_score_1_18, na.rm = TRUE),
    v12_aosi_zscore =
      (V12.aosi.total_score_1_18 -
        mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V12.aosi.total_score_1_18, na.rm = TRUE)
  ) %>%
  filter(GROUP == "HR_ASD")

# and then save the result of the whole pipeline as a new dataset
data_v2 <- data %>%
  mutate(
    v06_aosi_zscore =
      (V06.aosi.total_score_1_18 -
        mean(V06.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V06.aosi.total_score_1_18, na.rm = TRUE),
    v12_aosi_zscore =
      (V12.aosi.total_score_1_18 -
        mean(V12.aosi.total_score_1_18, na.rm = TRUE)) /
        sd(V12.aosi.total_score_1_18, na.rm = TRUE)
  ) %>%
  filter(GROUP == "HR_ASD")