#!/usr/bin/env Rscript ## # General Setup ## # set the working directory setwd("/YOUR/FILE/PATH/") ## # Packages ## # install packages, if necessary if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("edgeR") install.packages("ggplot2") install.packages("ghibli") install.packages("ggVennDiagram") # import libraries library(edgeR) library(ggplot2) library(ghibli) library(ggVennDiagram) ## # Plotting Palettes ## # change the graphical parameters par(mfrow=c(9,3)) # view all available ghibli palettes for(i in names(ghibli_palettes)) print(ghibli_palette(i)) # close the plot and return the display to the default graphical parameters dev.off() # retrieve the vector of colors associated with PonyoMedium ghibli_colors <- ghibli_palette("PonyoMedium", type = "discrete") # view the selected color palette ghibli_colors # vector with a subset of colors associated with PonyoMedium ghibli_subset <- c(ghibli_colors[3], ghibli_colors[6], ghibli_colors[4]) ## # Data ## # import gene count data tribolium_counts <- read.csv("TriboliumCounts.csv", row.names="X") ## # Pairwise Setup ## # add grouping factor group <- factor(c(rep("cntrl_4h",3), rep("treat_4h",3), rep("cntrl_24h",3), rep("treat_24h",3))) # create DGE list object list <- DGEList(counts=tribolium_counts,group=group) ## # Pairwise Normalization ## # plot the library sizes before normalization barplot(list$samples$lib.size*1e-6, names=1:12, ylab="Library size (millions)") # filter the list of gene counts based on expression levels keep <- filterByExpr(list) # view the number of filtered genes table(keep) # remove genes that are not expressed in either experimental condition list <- list[keep, , keep.lib.sizes=FALSE] # calculate scaling factors list <- calcNormFactors(list) # compute counts per million (CPM) using normalized library sizes normList <- cpm(list, normalized.lib.sizes=TRUE) ## # Pairwise Data Exploration ## # vector of shape numbers for the MDS plot points <- c(0,1,15,16) # vector of colors for the MDS plot colors <- rep(c(ghibli_colors[3], ghibli_colors[6]), 2) # add extra space to right of plot area and change clipping to figure par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) # MDS plot with distances approximating log2 fold changes plotMDS(list, col=colors[group], pch=points[group]) # place the legend outside the right side of the plot legend("topright", inset=c(-0.4,0), legend=levels(group), pch=points, col=colors) # close the plot dev.off() # calculate the log CPM of the gene count data logcpm <- cpm(list, log=TRUE) # draw a heatmap of individual RNA-seq samples using moderated log CPM heatmap(logcpm) ## # Pairwise Fitting ## # estimate common dispersion and tagwise dispersions to produce a matrix of pseudo-counts list <- estimateDisp(list) # plot dispersion estimates and biological coefficient of variation plotBCV(list) ## # Pairwise Contrasts ## ### ## treat_4h vs cntrl_4h ### # perform an exact test for treat_4h vs cntrl_4h tested_4h <- exactTest(list, pair=c("cntrl_4h", "treat_4h")) # view the total number of differentially expressed genes at a p-value of 0.05 summary(decideTests(tested_4h)) # plot log-fold change against log-counts per million with DE genes highlighted plotMD(tested_4h) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # create a results table of DE genes resultsTbl_4h <- topTags(tested_4h, n=nrow(tested_4h$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression resultsTbl_4h$topDE <- "NA" # identify significantly up DE genes resultsTbl_4h$topDE[resultsTbl_4h$logFC > 1 & resultsTbl_4h$FDR < 0.05] <- "Up" # identify significantly down DE genes resultsTbl_4h$topDE[resultsTbl_4h$logFC < -1 & resultsTbl_4h$FDR < 0.05] <- "Down" # create volcano plot ggplot(data=resultsTbl_4h, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) ### ## treat_24h vs cntrl_24h ### # perform an exact test for treat_24h vs cntrl_24h tested_24h <- exactTest(list, pair=c("cntrl_24h", "treat_24h")) # view the total number of differentially expressed genes at a p-value of 0.05 summary(decideTests(tested_24h)) # plot log-fold change against log-counts per million with DE genes highlighted plotMD(tested_24h) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # create a results table of DE genes resultsTbl_24h <- topTags(tested_24h, n=nrow(tested_24h$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression resultsTbl_24h$topDE <- "NA" # identify significantly up DE genes resultsTbl_24h$topDE[resultsTbl_24h$logFC > 1 & resultsTbl_24h$FDR < 0.05] <- "Up" # identify significantly down DE genes resultsTbl_24h$topDE[resultsTbl_24h$logFC < -1 & resultsTbl_24h$FDR < 0.05] <- "Down" # create volcano plot ggplot(data=resultsTbl_24h, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR resultsTbl_24h.keep <- resultsTbl_24h$FDR < 0.05 # create filtered results table of DE genes resultsTbl_24h_filtered <- resultsTbl_24h[resultsTbl_24h.keep,] ### ## treat_4h vs treat_24h ### # perform an exact test for treat_4h vs treat_24h tested_treat <- exactTest(list, pair=c("treat_24h", "treat_4h")) # view the total number of differentially expressed genes at a p-value of 0.05 summary(decideTests(tested_treat)) # plot log-fold change against log-counts per million with DE genes highlighted plotMD(tested_treat) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # create a results table of DE genes resultsTbl_treat <- topTags(tested_treat, n=nrow(tested_treat$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression resultsTbl_treat$topDE <- "NA" # identify significantly up DE genes resultsTbl_treat$topDE[resultsTbl_treat$logFC > 1 & resultsTbl_treat$FDR < 0.05] <- "Up" # identify significantly down DE genes resultsTbl_treat$topDE[resultsTbl_treat$logFC < -1 & resultsTbl_treat$FDR < 0.05] <- "Down" # create volcano plot ggplot(data=resultsTbl_treat, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR resultsTbl_treat.keep <- resultsTbl_treat$FDR < 0.05 # create filtered results table of DE genes resultsTbl_treat_filtered <- resultsTbl_treat[resultsTbl_treat.keep,] ### ## cntrl_4h vs cntrl_24h ### # perform an exact test for cntrl_4h vs cntrl_24h tested_cntrl <- exactTest(list, pair=c("cntrl_24h", "cntrl_4h")) # view the total number of differentially expressed genes at a p-value of 0.05 summary(decideTests(tested_cntrl)) # plot log-fold change against log-counts per million with DE genes highlighted plotMD(tested_cntrl) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # create a results table of DE genes resultsTbl_ncntrl <- topTags(tested_cntrl, n=nrow(tested_cntrl$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression resultsTbl_ncntrl$topDE <- "NA" # identify significantly up DE genes resultsTbl_ncntrl$topDE[resultsTbl_ncntrl$logFC > 1 & resultsTbl_ncntrl$FDR < 0.05] <- "Up" # identify significantly down DE genes resultsTbl_ncntrl$topDE[resultsTbl_ncntrl$logFC < -1 & resultsTbl_ncntrl$FDR < 0.05] <- "Down" # create volcano plot ggplot(data=resultsTbl_ncntrl, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR resultsTbl_cntrl.keep <- resultsTbl_ncntrl$FDR < 0.05 # create filtered results table of DE genes resultsTbl_cntrl_filtered <- resultsTbl_ncntrl[resultsTbl_cntrl.keep,] ## # Pairwise Results Exploration ## # retrieve set of DE gene names for 24h contrast geneSet_24h <- rownames(resultsTbl_24h_filtered) # retrieve set of DE gene names for treat contrast geneSet_treat <- rownames(resultsTbl_treat_filtered) # retrieve set of DE gene names for cntrl contrast geneSet_cntrl <- rownames(resultsTbl_cntrl_filtered) # create combined list of DE gene names list_venn <- list(h24 = geneSet_24h, treat = geneSet_treat, cntrl = geneSet_cntrl) # create venn diagram ggVennDiagram(list_venn, label_alpha=0.25, category.names = c("24h","treat","cntrl")) + scale_color_brewer(palette = "Paired") ## # GLM Design ## # import grouping factor glm_targets <- read.csv(file="groupingFactors_tribolium.csv", row.names="sample") # setup a design matrix glm_group <- factor(paste(glm_targets$treatment, glm_targets$hours, sep=".")) # create DGE list object glm_list <- DGEList(counts=tribolium_counts, group=glm_group) # add the sample names colnames(glm_list) <- rownames(glm_targets) # parametrize the experimental design with a one-way layout glm_design <- model.matrix(~ 0 + glm_group) # add group names colnames(glm_design) <- levels(glm_group) ## # GLM Normalization ## # filter the list of gene counts based on expression levels glm_keep <- filterByExpr(glm_list) # view the number of filtered genes table(glm_keep) # remove genes that are not expressed in either experimental condition glm_list <- glm_list[glm_keep, , keep.lib.sizes=FALSE] # calculate scaling factors glm_list <- calcNormFactors(glm_list) # compute counts per million (CPM) using normalized library sizes norm_glm_list <- cpm(glm_list, normalized.lib.sizes=TRUE) ## # GLM Fitting ## # estimate common dispersion and tagwise dispersions to produce a matrix of pseudo-counts glm_list <- estimateDisp(glm_list, glm_design, robust=TRUE) # estimate the QL dispersions glm_fit <- glmQLFit(glm_list, glm_design, robust=TRUE) # plot the QL dispersions plotQLDisp(glm_fit) ## # GLM Contrasts ## ### ## treatment ### # examine the overall effect of treatment con.treatment <- makeContrasts(set.treatment = (treat.4h + treat.24h)/2 - (cntrl.4h + cntrl.24h)/2, levels=glm_design) # conduct gene wise statistical tests anov.treatment <- glmTreat(glm_fit, contrast=con.treatment) # view summary of DE genes summary(decideTests(anov.treatment)) # create MD plot of DE genes plotMD(anov.treatment) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # generate table of DE genes tagsTbl_treatment <- topTags(anov.treatment, n=nrow(anov.treatment$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression tagsTbl_treatment$topDE <- "NA" # identify significantly up DE genes tagsTbl_treatment$topDE[tagsTbl_treatment$logFC > 1 & tagsTbl_treatment$FDR < 0.05] <- "UP" # identify significantly down DE genes tagsTbl_treatment$topDE[tagsTbl_treatment$logFC < -1 & tagsTbl_treatment$FDR < 0.05] <- "DOWN" # create volcano plot ggplot(data=tagsTbl_treatment, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR tagsTbl_treatment.glm_keep <- tagsTbl_treatment$FDR < 0.05 # create filtered results table of DE genes tagsTbl_treatment_filtered <- tagsTbl_treatment[tagsTbl_treatment.glm_keep,] ### ## hours ### # examine the overall effect of hours con.hours <- makeContrasts(set.hours = (cntrl.24h + treat.24h)/2 - (cntrl.4h + treat.4h)/2, levels=glm_design) # conduct gene wise statistical tests anov.hours <- glmTreat(glm_fit, contrast=con.hours) # view summary of DE genes summary(decideTests(anov.hours)) # create MD plot of DE genes plotMD(anov.hours) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # generate table of DE genes tagsTbl_hours <- topTags(anov.hours, n=nrow(anov.hours$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression tagsTbl_hours$topDE <- "NA" # identify significantly up DE genes tagsTbl_hours$topDE[tagsTbl_hours$logFC > 1 & tagsTbl_hours$FDR < 0.05] <- "UP" # identify significantly down DE genes tagsTbl_hours$topDE[tagsTbl_hours$logFC < -1 & tagsTbl_hours$FDR < 0.05] <- "DOWN" # create volcano plot ggplot(data=tagsTbl_hours, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR tagsTbl_hours.glm_keep <- tagsTbl_hours$FDR < 0.05 # create filtered results table of DE genes tagsTbl_hours_filtered <- tagsTbl_hours[tagsTbl_hours.glm_keep,] ### ## interaction ### # examine any interaction effect con.interaction <- makeContrasts(set.interaction = ((treat.4h + treat.24h)/2 - (cntrl.4h + cntrl.24h)/2) - ((cntrl.24h + treat.24h)/2 - (cntrl.4h + treat.4h)/2), levels=glm_design) # conduct gene wise statistical tests anov.interaction <- glmTreat(glm_fit, contrast=con.interaction) # view summary of DE genes summary(decideTests(anov.interaction)) # create MD plot of DE genes plotMD(anov.interaction) # add blue lines to indicate 2-fold changes abline(h=c(-1, 1), col="blue") # generate table of DE genes tagsTbl_inter <- topTags(anov.interaction, n=nrow(anov.interaction$table), adjust.method="fdr")$table # add column for identifying direction of DE gene expression tagsTbl_inter$topDE <- "NA" # identify significantly up DE genes tagsTbl_inter$topDE[tagsTbl_inter$logFC > 1 & tagsTbl_inter$FDR < 0.05] <- "UP" # identify significantly down DE genes tagsTbl_inter$topDE[tagsTbl_inter$logFC < -1 & tagsTbl_inter$FDR < 0.05] <- "DOWN" # create volcano plot ggplot(data=tagsTbl_inter, aes(x=logFC, y=-log10(FDR), color = topDE)) + geom_point() + theme_minimal() + scale_colour_discrete(type = ghibli_subset, breaks = c("Up", "Down")) # identify significantly DE genes by FDR tagsTbl_inter.glm_keep <- tagsTbl_inter$FDR < 0.05 # create filtered results table of DE genes tagsTbl_inter_filtered <- tagsTbl_inter[tagsTbl_inter.glm_keep,] ## # GLM Results Exploration ## # retrieve set of DE gene names for hours contrast geneSet_hours <- rownames(tagsTbl_hours_filtered) # retrieve set of DE gene names for interaction contrast geneSet_interaction <- rownames(tagsTbl_inter_filtered) # create combined glm_list of DE gene names glm_list_venn <- list(hours = geneSet_hours, interaction = geneSet_interaction) # create venn diagram ggVennDiagram(glm_list_venn, label_alpha=0.25, category.names = c("hours","interaction")) + scale_color_brewer(palette = "Paired") ## # Saving Plots ## # save the glm results venn diagram plot to a file png("glm_tribolium_venn.png") ggVennDiagram(glm_list_venn, label_alpha=0.25, category.names = c("hours","interaction")) + scale_color_brewer(palette = "Paired") dev.off() ## # Saving Tables ## # write the table of normalized counts to a file write.table(norm_glm_list, file="Tribolium_normalizedCounts.csv", sep=",", row.names=TRUE)