library(tidyverse) # functions for tidy data manipulation library(cowplot) # pretty-looking graphs rm(list=ls()) # clear the decks options(stringsAsFactors = FALSE) # useful option for non-Tidyverse work ### Data import ### data_wide <- read_tsv('Incucyte_all.txt') %>% print() ### Data tidying ### # convert wide data to narrow/tidy format data_narrow <- gather(data_wide, key='treatment_group', value='confluence', EGFP_untreated_1:Ptf1a_DOX_3) %>% print() # split treatment_group variable into separate variables for overexpressed gene, treatment, and replicate data_narrow <- separate(data_narrow, treatment_group, c('gene', 'treatment', 'rep')) %>% print() ### Using "mutate" to create/change data columns - a toy example ### # create toy data temp_df <- bind_cols(sample=c(1,2,3), temp=c(-40, 32, 98.6)) %>% print() # use "mutate" to create new column with temperature in Celsius temp_df <- mutate(temp_df, tempC=(temp-32)*5/9) %>% print() # create toy data, again temp_df <- bind_cols(time=c(1,2,3), temp=c(-40, 32, 98.6)) # now let's add both Celsius and Kelvin temperatures in one command temp_df <- mutate(temp_df, tempC=(temp-32)*5/9, tempK=tempC+273.15) %>% rename(tempF = temp) %>% # and also rename original variable "temp" to "tempF" print() ### Using "mutate" on our proliferation data ### # change treatment to ordered factor data_narrow <- mutate(data_narrow, treatment=factor(treatment, levels=c('untreated', 'DOX'))) %>% print() ### Comparing the Tidyverse approach to base R ### # for comparison to base R approach, read in original data via "read.delim" function data_baseR <- read.delim('Incucyte_all.txt', sep='\t') # note that the only easy way to convert a wide data frame to narrow, outside of the Tidyverse, is using # the "melt" command from the "reshape" package - which is something Hadley Wickham developed before he # came up with the extended Tidyverse data_narrow_baseR <- reshape::melt.data.frame(data_baseR, id.vars=c('Line', 'Elapsed', 'Date', 'Experiment'), variable_name='treatment_group') # note that "melt" doesn't let you specify the name of the variable column, it just calls it "value" # by default - so we have to manually rename it as 'confluence' colnames(data_narrow_baseR)[colnames(data_narrow_baseR)=='value'] <- 'confluence' # now, let's split up the "treatment_group" variable # first, though, we need to convert "treatment_group" from a factor variable (the dumb default behavior # of melt) to character data_narrow_baseR$treatment_group <- as.character(data_narrow_baseR$treatment_group) # then we split treatment_group into a list, aka the most unintuitive data type in R l <- strsplit(data_narrow_baseR$treatment_group, '_') # then we convert the list into a matrix, with three columns m <- matrix(unlist(l), nrow=length(l), ncol=3, byrow=T) # convert the matrix into a data frame df <- data.frame(m) colnames(df) <- c('gene', 'treatment', 'rep') # and give the columns of the data frame the names we want # finally, reassemble our original data frame with our new columns replacing the original "treatment_group" data_narrow_baseR <- cbind(data_narrow_baseR[c('Line', 'Elapsed', 'Date', 'Experiment')], df, data_narrow_baseR['confluence']) # let's compare base R data frame to the original Tidyverse one print(head(data_narrow_baseR)) print(head(data_narrow)) # are all elements the same between the data frames? table(data_narrow == data_narrow_baseR) # yes - and it only took all this insanely opaque code to get here! ### Plotting raw confluence data (Figure 1) ### fig1 <- ggplot(data_narrow, aes(x=Elapsed, y=confluence, group=interaction(Experiment, gene, treatment, rep), color=interaction(gene, treatment))) + geom_point(alpha=0.7) + geom_line(alpha=0.7) + scale_color_manual(values=c('greenyellow', 'orange', 'green4', 'orangered3')) + theme(aspect.ratio=1, legend.position='top', legend.title = element_blank(), legend.text = element_text(size=10)) + facet_wrap(facets=vars(Line), scales='free') fig1 ### Using "summarize" to operate across groups of data - a toy example ### data(mpg) # mileage data from 1999-2008 print(mpg) # just to check what the data look like # first we need to group the cars mpg_by_cyl <- group_by(mpg, cyl) %>% print() # this has created a new tibble, but we can easily confirm that all elements of the data frame itself # are preserved from the original: table(mpg_by_cyl==mpg) # identify and summarize car models within each cylinder group mpg_cyl_summarize <- summarize(mpg_by_cyl, n=n(), hwy_mean=mean(hwy), hwy_sd=sd(hwy), displ_mean=mean(displ), displ_sd=sd(displ)) print(mpg_cyl_summarize) # first, without creating a new data frame, we will call "group_by" on the original data, # then pipe it to "summarize" group_by(mpg, cyl) %>% summarize(n=n(), hwy_mean=mean(hwy), hwy_sd=sd(hwy), displ_mean=mean(displ), displ_sd=sd(displ)) ### Using "summarize" to combine technical replicates in our proliferation data ### print(data_narrow) # just to remind ourselves what the data look like # calculate mean confluence across technical replicates data_mean <- group_by(data_narrow, Line, Experiment, Date, Elapsed, gene, treatment) %>% summarize(conf_mean=mean(confluence), conf_sd=sd(confluence)) %>% print() data_mean <- ungroup(data_mean) # this will be necessary later. and is a good idea generally # the new data frame should be 1/3 the length of the original, since the triple replicates have been # collapsed down to one each - let's confirm nrow(data_mean) == nrow(data_narrow)/3 # first, add CV as a column to our data frame data_mean <- mutate(data_mean, conf_cv=conf_sd/conf_mean) ### Data inspection ### # create base plot p <- ggplot(data_mean, aes(x=conf_mean, y=conf_cv)) # plot with various color coding schemes p_line <- p + geom_point(aes(color=factor(Line)), alpha=0.5, size=1) p_gene <- p + geom_point(aes(color=gene), alpha=0.5, size=1) p_treatment <- p + geom_point(aes(color=treatment), alpha=0.5, size=1) plot_grid(p_line, p_gene, p_treatment, align='h') ### Plot mean proliferation data for each experiment (Figure 2) ### fig2 <- ggplot(data_mean, aes(x=Elapsed, y=conf_mean, group=interaction(Experiment, gene, treatment), color=interaction(gene, treatment))) + geom_point(alpha=0.7) + geom_line(alpha=0.7) + geom_errorbar(aes(ymin=conf_mean-conf_sd, ymax=conf_mean+conf_sd)) + scale_color_manual(values=c('greenyellow', 'orange', 'green4', 'orangered3')) + expand_limits(y=0) + theme(aspect.ratio=1, legend.position='top', legend.title = element_blank(), legend.text = element_text(size=10)) + facet_wrap(facets=vars(Line), scales='free') fig2 ### Using "filter" to examine subsets of data - a toy example ### # filter mpg data for top 25% highest highway mileage of automatic transmission cars, and graph # the number of models per manufacturer filter(mpg, str_detect(trans, 'auto')) %>% filter(hwy > quantile(hwy, 0.75)) %>% count(manufacturer) %>% ggplot(aes(x=reorder(manufacturer, -n), y=n)) + geom_bar(stat='identity') + xlab('manufacturer') + ylab('number of models') # doing the same thing with multiple steps mpg_auto <- filter(mpg, str_detect(trans, 'auto')) %>% print() mpg_auto_best <- filter(mpg_auto, hwy > quantile(hwy, 0.75)) %>% print() mpg_auto_best_who <- count(mpg_auto_best, manufacturer) %>% print() ggplot(mpg_auto_best_who, aes(x=reorder(manufacturer, -n), y=n)) + geom_bar(stat='identity') + xlab('manufacturer') + ylab('number of models') ### Using "filter" to explore our proliferation data ### # take a quick look at our data in its current form fig2 # filter for start and end of experiment, and graph mean confluence filter(data_mean, Elapsed == min(Elapsed)| Elapsed==max(Elapsed)) filter(data_mean, Elapsed == min(Elapsed)| Elapsed==max(Elapsed)) %>% ggplot(aes(x=Elapsed, y=conf_mean, group=interaction(Experiment, gene, treatment), color=interaction(gene, treatment))) + geom_point() + geom_line() + scale_color_manual(values=c('greenyellow', 'orange', 'green4', 'orangered3')) + expand_limits(y=0) + theme(aspect.ratio=1, legend.position='top', legend.title = element_blank(), legend.text = element_text(size=10)) + facet_wrap(facets=vars(Line), scales='free') ### Transforming our data by normalization to t=0 ### # create new data frame for initial confluence values, get rid of extraneous columns baseline <- filter(data_mean, Elapsed==0) %>% rename(initial=conf_mean) %>% select(-Elapsed, -conf_sd, -conf_cv) %>% print() # create new data set 'data_fold' by adding initial confluence variable to data_mean # inner_join will re-use same 'initial' variables across all timepoints data_fold <- inner_join(data_mean, baseline) %>% print() # create new variable for fold-change over initial confluence (also standard deviation) data_fold <- mutate(data_fold, fold=conf_mean/initial, fold_sd=conf_sd/initial) %>% print() # plot fold-change (with SD) for each cell line/experiment (Figure 3) fig3 <- ggplot(data_fold, aes(x=Elapsed, y=fold, group=interaction(Experiment, gene, treatment), color=interaction(gene, treatment))) + geom_point(alpha=0.7) + geom_line(alpha=0.7) + geom_errorbar(aes(ymin=fold-fold_sd, ymax=fold+fold_sd)) + scale_color_manual(values=c('greenyellow', 'orange', 'green4', 'orangered3')) + expand_limits(y=0) + theme(aspect.ratio=1) + ylab('net growth') + theme(aspect.ratio=1, legend.position='top', legend.title = element_blank(), legend.text = element_text(size=10)) + facet_wrap(facets=vars(Line), scales='free') fig3 ### Summarizing across experiments ### # calculate mean fold-change (and SD) across experiments data_fold_mean <- group_by(data_fold, Line, Elapsed, gene, treatment) %>% summarize(fold_mean=mean(fold), fold_mean_sd=sd(fold)) %>% print() ungroup(data_fold_mean) # plot mean fold-change (with SD) for each cell line (Figure 4) fig4 <- ggplot(data_fold_mean, aes(x=Elapsed, y=fold_mean, group=interaction(gene, treatment), color=interaction(gene, treatment))) + geom_point(alpha=0.7) + geom_line(alpha=0.7) + geom_errorbar(aes(ymin=fold_mean-fold_mean_sd, ymax=fold_mean+fold_mean_sd)) + scale_color_manual(values=c('greenyellow', 'orange', 'green4', 'orangered3')) + expand_limits(y=0) + theme(aspect.ratio=1) + ylab('net growth') + theme(aspect.ratio=1, legend.position='top', legend.title = element_blank(), legend.text = element_text(size=10)) + facet_wrap(facets=vars(Line), scales='free') fig4 # calculate maximum mean fold change within each cell line, add half-max value to graphs max_mean <- group_by(data_fold_mean, Line) %>% summarize(max=max(fold_mean)) %>% print() fig4 <- fig4 + geom_hline(data=max_mean, aes(yintercept=max/2), col='darkgray', lty='dashed') fig4 ### Plot all our data from raw to transformed ### plot_grid(fig1 + theme(legend.position = "none"), fig2 + theme(legend.position = "none"), fig3 + theme(legend.position = "none"), fig4 + theme(legend.position = "none"), labels=c('Fig1', 'Fig2', 'Fig3', 'Fig4'), label_size=12)