--- title: "Geoduck DNA Methylation" author: "HPutnam" date: "05/14/2020" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) ``` ## Load Libraries ```{r message = FALSE, warning = FALSE} library(plotrix) library(ggplot2) library(gridExtra) library(seacarb) library(pheatmap) library(dplyr) library(tidyverse) library(tidyr) library(goseq) library(genefilter) library(gplots) library(cowplot) library(lsmeans) library(data.table) library(RColorBrewer) library(randomcoloR) #library(GSEABase) library(ggpubr) library(hrbrthemes) library(viridis) ``` Flow Rates ```{r Flow Rates} # FLOW RATES #Initial Exposure: Quantifying flow and testing for differences among treatments Flow1 <- read.csv("RAnalysis/Data/Flow1_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA #flow values measured in ml/10 sec Flow1$Rate <- (((Flow1$Rate1 + Flow1$Rate2)/2)*6)/1000*60 #calculate flow rate in liters per hour Flow1.Rate <- aggregate(Rate ~ Treatment, data=Flow1, FUN=mean) #mean flow by treatment Flow1.Rate$se <- aggregate(Rate ~ Treatment, data=Flow1, FUN=std.error) #sem flow by treatment mean(Flow1$Rate) #mean flow across all tanks std.error(Flow1$Rate) #sem flow across all tanks length(Flow1$Rate) #n flow across all tanks FL1 <- aov(Rate ~ Treatment, data=Flow1) #test for differences in flow rate between treatments FL1.res <- anova(FL1) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(FL1$residuals) #plot histogram of residuals boxplot(FL1$residuals) #plot boxplot of residuals plot(FL1) #display residuals versus fitter, normal QQ plot, leverage plot #Secondary Exposure: Quantifying flow and testing for differences among treatments Flow2 <- read.csv("RAnalysis/Data/Flow2_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA #flow values measured in ml/15 sec Flow2$Rate <- ((Flow2$Rate1)*4)/1000*60 #calculate flow rate in liters per hour Flow2.Rate <- aggregate(Rate ~ Treatment, data=Flow2, FUN=mean) #mean flow by treatment Flow2.Rate$se <- aggregate(Rate ~ Treatment, data=Flow2, FUN=std.error) #sem flow by treatment mean(Flow2$Rate) #mean flow across all tanks std.error(Flow2$Rate) #sem flow across all tanks length(Flow2$Rate) #n flow across all tanks FL2 <- aov(Rate ~ Treatment, data=Flow2) #test for differences in flow rate between treatments FL2.res <-anova(FL2) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(FL2$residuals) #plot histogram of residuals boxplot(FL2$residuals) #plot boxplot of residuals plot(FL2) #display residuals versus fitter, normal QQ plot, leverage plot ``` Feeding Rates ```{r Feeding Rates} # FEEDING RATES #Initial Exposure: Quantifying feeding and testing for differences among treatments cells.1 <- read.csv("RAnalysis/Data/Cell_Counts_Juvenile_Geoduck_Exp1.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA cells.1$Avg.Cells <- rowMeans(cells.1[,c("Count1", "Count2")], na.rm = TRUE) #calculate average of counts cells.1$cell.num <- cells.1$Avg.Cells/cells.1$Volume.Counted #calculate density avg.cells.tank.1 <- do.call(data.frame,aggregate(cell.num ~ Tank, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Tank avg.cells.trt.1 <- do.call(data.frame,aggregate(cell.num ~ Treatment, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment avg.cells.1 <- do.call(data.frame,aggregate(cell.num ~ Volume.Counted, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment colnames(avg.cells.tank.1) <- c("Tank", "mean", "se") #rename columns colnames(avg.cells.trt.1) <- c("Treatment", "mean", "se") #rename columns cells1.tank <- aov(cell.num ~Tank, data=cells.1) #test the hypothesis feeding does not differ between tanks cells1.tank.res <-anova(cells1.tank) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(cells1.tank$residuals) #plot histogram of residuals boxplot(cells1.tank$residuals) #plot boxplot of residuals plot(cells1.tank) #display residuals versus fitter, normal QQ plot, leverage plot cells1.trt <- aov(cell.num ~Treatment, data=cells.1) #test the hypothesis feeding does not differ between treatments cells1.trt.res <-anova(cells1.trt) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(cells1.trt$residuals) #plot histogram of residuals boxplot(cells1.trt$residuals) #plot boxplot of residuals plot(cells1.trt) #display residuals versus fitter, normal QQ plot, leverage plot #Secondary Exposure: Quantifying feeding and testing for differences among treatments cells.2 <- read.csv("RAnalysis/Data/Cell_Counts_Juvenile_Geoduck_Exp2.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA cells.2$cell.num <- rowMeans(cells.2[,c("Count1", "Count2", "Count3")], na.rm = TRUE) #calculate average of counts avg.cells.tank.2 <- do.call(data.frame,aggregate(cell.num ~ Tank, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Tank avg.cells.trt.2 <- do.call(data.frame,aggregate(cell.num ~ Treatment, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment avg.cells.2 <- do.call(data.frame,aggregate(cell.num ~ Notes, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment colnames(avg.cells.tank.2) <- c("Tank", "mean", "se") #rename columns colnames(avg.cells.trt.2) <- c("Treatment", "mean", "se") #rename columns cells2.tank <- aov(cell.num ~Tank, data=cells.2) #test the hypothesis feeding does not differ between tanks cells2.tank.res <-anova(cells2.tank) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(cells2.tank$residuals) #plot histogram of residuals boxplot(cells2.tank$residuals) #plot boxplot of residuals plot(cells2.tank) #display residuals versus fitter, normal QQ plot, leverage plot cells2.trt <- aov(cell.num ~Treatment, data=cells.2) #test the hypothesis feeding does not differ between treatments cells2.trt.res <-anova(cells2.trt) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(cells2.trt$residuals) #plot histogram of residuals boxplot(cells2.trt$residuals) #plot boxplot of residuals plot(cells2.trt) #display residuals versus fitter, normal QQ plot, leverage plot ``` Continuous pH data ```{r Continuous pH} #CONTINUOUS EXPERIMENTAL PH DATA #DuraFET_pH_data.csv pH <- read.csv("RAnalysis/Data/DuraFET_pH_data.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA pH$Date.Time <-as.POSIXct(pH$Date.Time, format="%m/%d/%y %H:%M") #convert date format pH$Date <- as.Date(pH$Date.Time) #convert Date only pH$Time <- format(as.POSIXct(pH$Date.Time) ,format = "%H:%M:%S") #convert Time only pH.low <- do.call(data.frame,aggregate(Low ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day pH.med <- do.call(data.frame,aggregate(Medium ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day pH.amb <- do.call(data.frame,aggregate(Ambient ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day pH.low$Treatment <- "pH 7.0" #Add treatment Information colnames(pH.low) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format pH.med$Treatment <- "pH 7.4" #Add treatment Information colnames(pH.med) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format pH.amb$Treatment <- "pH 7.9" #Add treatment Information colnames(pH.amb) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format daily.pH <- rbind(pH.amb, pH.med, pH.low) #bind treatment data daily.pH #view data # Plot daily averages of pH data for the complete experiment All.pH <- ggplot(daily.pH, aes(x=Date, y=mean, group=Treatment)) + #set up plot information geom_errorbar(aes(ymin=daily.pH$mean-daily.pH$se, ymax=daily.pH$mean+daily.pH$se), colour="black", width=.1, position = position_dodge(width = 0.05)) + #add standard error bars about the mean geom_point(aes(shape=Treatment), size = 2, position = position_dodge(width = 0.05)) + #include points in the shape of the treatments annotate("text", x=as.Date("2016-03-25"), y=7.95, label = "No Data") + #add text to the graphic where data are missing annotate("text", x=as.Date("2016-06-15"), y=7.95, label = "No Data") + #add text to the graphic where data are missing xlab("Time") + #label x axis ylab("pH Total Scale") + # label y axis ylim(6.8,8.1) + # set y axis scale geom_vline(xintercept = 16899, linetype="dotted", color = "gray", size=1) + #add vertical line geom_vline(xintercept = 16928, linetype="dashed", color = "gray", size=1) + #add vertical line geom_vline(xintercept = 17013, linetype="dotted", color = "gray", size=1) + #add vertical line theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.position=c(.65, .25), #set legend location legend.text = element_text(size = 8), #set the legend text size legend.key = element_blank(), #remove the legend background legend.title = element_text(size=8, face="bold")) + #set legend title attributes ggtitle("A) Experimental pH") + #add a main title theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) #set title attributes All.pH #view plot ``` Continuous Temperature data - Avtech ```{r Continuous Temperature} #CONTINUOUS EXPERIMENTAL TEMPERATURE DATA #Avtech_temp_data.csv temp <- read.csv("RAnalysis/Data/Avtech_temp_data.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA temp$Date.Time <-as.POSIXct(temp$Date.Time, format="%m/%d/%y %H:%M") #convert date format temp$Date <- as.Date(temp$Date.Time) #convert Date only temp$Time <- format(as.POSIXct(temp$Date.Time) ,format = "%H:%M:%S") #convert time only temp.low <- do.call(data.frame,aggregate(Low ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day temp.med <- do.call(data.frame,aggregate(Medium ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day temp.amb <- do.call(data.frame,aggregate(Ambient ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day temp.low$Treatment <- "Super Low" #Add treatment Information colnames(temp.low) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format temp.med$Treatment <- "Low" #Add treatment Information colnames(temp.med) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format temp.amb$Treatment <- "Ambient" #Add treatment Information colnames(temp.amb) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format daily.temp <- rbind(temp.amb, temp.med, temp.low) #bind treatment data daily.temp #view data All.temp <- ggplot(daily.temp, aes(x=Date, y=mean, group=Treatment)) + #set up plot information geom_errorbar(aes(ymin=daily.temp$mean-daily.temp$se, ymax=daily.temp$mean+daily.temp$se), colour="black", width=.1, position = position_dodge(width = 0.05)) + #add standard error bars about the mean geom_point(aes(shape=Treatment), size = 2, position = position_dodge(width = 0.05)) + #include points in the shape of the treatments annotate("text", x=as.Date("2016-05-25"), y=14, label = "No Data") + #add text to the graphic where data are missing xlab("Time") + #label x axis ylab("Temperature °C") + #label y axis ylim(0,20) + # set y axis scale geom_vline(xintercept = 16899, linetype="dotted", color = "gray", size=1) + #add vertical line geom_vline(xintercept = 16928, linetype="dashed", color = "gray", size=1) + #add vertical line geom_vline(xintercept = 17013, linetype="dotted", color = "gray", size=1) + #add vertical line theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.position = "none") + #remove legend ggtitle("B) Experimental Temperature") + #add a main title theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) #set title attributes All.temp #view plot FigureS1 <- arrangeGrob(All.pH, All.temp, ncol=1) ggsave(file="RAnalysis/Figs/Supplementary_Figs/Fig.S5.pdf", FigureS1, width = 6, height = 8, units = c("in")) ``` Discrete Seawater Chemistry ```{r Discrete Seawater Chemistry} #SEAWATER CHEMISTRY ANALYSIS FOR DISCRETE MEASUREMENTS #pH Tris Calibration Curves #Data to calculate conversion equations from mV to total scale using tris standard for pH probe path <-("RAnalysis/Data/pH_Calibration_Files/") #set path to calibration file folder file.names<-list.files(path = path, pattern = "csv$") #list all the file names with csv pH.cals <- data.frame(matrix(NA, nrow=length(file.names), ncol=4, dimnames=list(file.names,c("Date", "Intercept", "Slope","R2")))) #generate an empty 3 column dataframe with specific column names for(i in 1:length(file.names)) { # for every file in list start at the first and run this following function Calib.Data <-read.table(file.path(path,file.names[i]), header=TRUE, sep=",", na.string="NA", as.is=TRUE) #reads in the data files model <-lm(mVTris ~ TTris, data=Calib.Data) #runs a linear regression of mV as a function of temperature coe <- coef(model) #extracts the coeffecients R <- summary(model)$r.squared #extracts the R2 pH.cals[i,2:3] <- coe #inserts coef in the dataframe pH.cals[i,4] <- R #inserts R2 in the dataframe pH.cals[i,1] <- substr(file.names[i],1,8) #stores the file name in the Date column } colnames(pH.cals) <- c("Calib.Date", "Intercept", "Slope", "R2") #names the columns of the dataframe pH.cals #view data #Calculate Total pH SW.chem <- read.csv("RAnalysis/Data/SW_Chem_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA SW.chem <- merge(pH.cals, SW.chem, by="Calib.Date") #merge pH calibrations with Seawater chemistry data R <- 8.31447215 #gas constant in J mol-1 K-1 F <-96485.339924 #Faraday constant in coulombs mol-1 mvTris <- SW.chem$Temperature*SW.chem$Slope+SW.chem$Intercept #calculate the mV of the tris standard using the temperature mv relationships in the measured standard curves STris<-27.5 #salinity of the Tris from local batch made for Manchester WA salinity conditions in Spring 2016. Batch date = 20160214 phTris<- (11911.08-18.2499*STris-0.039336*STris^2)*(1/(SW.chem$Temperature+273.15))-366.27059+ 0.53993607*STris+0.00016329*STris^2+(64.52243-0.084041*STris)*log(SW.chem$Temperature+273.15)-0.11149858*(SW.chem$Temperature+273.15) #calculate the pH of the tris (Dickson A. G., Sabine C. L. and Christian J. R., SOP 6a) SW.chem$pH.Total<-phTris+(mvTris/1000-SW.chem$pH.MV/1000)/(R*(SW.chem$Temperature+273.15)*log(10)/F) #calculate the pH on the total scale (Dickson A. G., Sabine C. L. and Christian J. R., SOP 6a) #calculate mean TA from titrations Trts.TA <- SW.chem %>% group_by(Treatment.Group, Exposure) %>% summarise(mean = mean(Total.Alkalinity,na.rm = TRUE),sem=std.error(Total.Alkalinity,na.rm = TRUE), N= sum(!is.na(Total.Alkalinity))) TA.mod <- aov(Total.Alkalinity ~ Treatment, data=SW.chem) summary(TA.mod) par(mfrow=c(1,3)) hist(TA.mod$residuals) boxplot(TA.mod$residuals) plot(TA.mod$fitted.values,TA.mod$residuals) #apply mean TA value per treatment to new column for seacarb calculations # SW.chem %>% # mutate(TA.mean = case_when(Treatment.Group %in% Ambient ~ Trts.TA[1,3], # Treatment.Group %in% Mid ~ Trts.TA[4,3], # Treatment.Group %in% High ~ Trts.TA[3,3])) #Calculate CO2 parameters using seacarb carb.output <- carb(flag=8, var1=SW.chem$pH.Total, var2=SW.chem$TA.calc/1000000, S= SW.chem$Salinity, T=SW.chem$Temperature, P=0, Pt=0, Sit=0, pHscale="T", kf="pf", k1k2="l", ks="d") #calculate seawater chemistry parameters using seacarb carb.output$ALK <- carb.output$ALK*1000000 #convert to µmol kg-1 carb.output$CO2 <- carb.output$CO2*1000000 #convert to µmol kg-1 carb.output$HCO3 <- carb.output$HCO3*1000000 #convert to µmol kg-1 carb.output$CO3 <- carb.output$CO3*1000000 #convert to µmol kg-1 carb.output$DIC <- carb.output$DIC*1000000 #convert to µmol kg-1 carb.output <- cbind(SW.chem$Measure.Date, SW.chem$Exposure, SW.chem$Sample.ID, SW.chem$Treatment, carb.output) #combine the sample information with the seacarb output colnames(carb.output) <- c("Date", "Exposure", "Tank", "Treatment", "flag", "Salinity", "Temperature","Patm", "Pressure", "pH", "CO2", "fCO2", "pCO2", "fCO2pot","pCO2pot","fCOinsitu", "pCOinsitu","HCO3", "CO3", "DIC", "TA", "Aragonite.Sat", "Calcite.Sat") #Rename columns to describe contents carb.output <- subset(carb.output, select= c("Exposure","Date", "Tank", "Treatment", "Salinity", "Temperature", "pH", "CO2", "pCO2", "HCO3", "CO3", "DIC", "TA", "Aragonite.Sat")) #Calculate descriptive stats for seawater chemistry by Exposure mean.carb.output <- carb.output %>% group_by(Exposure, Treatment) %>% summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"), mean) sem.carb.output <- carb.output %>% group_by(Exposure, Treatment) %>% summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"), std.error) N.carb.output <- carb.output %>% group_by(Exposure, Treatment) %>% summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"), length) ``` Shell Size ```{r Shell Size} ##### Shell Size ##### JUVENILE GEODUCK SHELL SIZE ##### seed.size <- read.csv("RAnalysis/Data/Size_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA seed.size$Treatment <- factor(seed.size$Treatment, levels = c("pH 7.9", "pH 7.4", "pH 7.0")) seed.size$Secondary <- factor(seed.size$Secondary, levels = c("pH 7.9", "pH 7.4", "pH 7.0")) #Initial Exposure Experimental Component Initial <- subset(seed.size, Component=="Exp1", select = Date:Area) #subset dataframe to exposure 1 data Initial$Ratio <- Initial$Length/Initial$Width #calculate the length:width of the shell #calculating shell area means for Exposure 1 normalizations E1.norms <- aggregate(Area ~ Day*Treatment, data=Initial, mean) #calculate mean area by Day and Treatment E1.norm.area.amb <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.9", select = Area) #subset mean area for day 1 ambient condition E1.norm.area.med <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.4", select = Area) #subset mean area for day 1 medium condition E1.norm.area.low <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.0", select = Area) #subset mean area for day 1 low condition E1.norms <- function(x) { #write function if(x == "pH 7.9") y <- E1.norm.area.amb #if Treatment equals Ambient assign day 1 Ambient mean as normalization factor if(x == "pH 7.4") y <- E1.norm.area.med #if Treatment equals Medium assign day 1 Medium mean as normalization factor if(x == "pH 7.0") y <- E1.norm.area.low #if Treatment equals Low assign day 1 Low mean as normalization factor return(y) #return result } Initial$A.norm <- as.numeric(sapply(Initial$Treatment,E1.norms)) #add a column with the normalization values Initial$A.rel <- Initial$Area/Initial$A.norm #normalize the area to be relative size Init.area <- do.call(data.frame,aggregate(Area ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem Init.Len <- do.call(data.frame,aggregate(Length ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem Init.Wid <- do.call(data.frame,aggregate(Width ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem Init.Ratio <- do.call(data.frame,aggregate(Ratio ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem Init.ARel <- do.call(data.frame,aggregate(A.rel ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem Init.Shell.size <- cbind(Init.area, Init.Len$Length.mean, Init.Len$Length.se, Init.Wid$Width.mean, Init.Wid$Width.se, Init.Ratio$Ratio.mean, Init.Ratio$Ratio.se, Init.ARel$A.rel.mean, Init.ARel$A.rel.se) #bind columns into dataframe colnames(Init.Shell.size) <- c("Day", "Treatment", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns # Significance testing for initial exposure Init <- aov(log10(A.rel) ~ Treatment*Day, data=Initial) #test the hypothesis relative size does not differ between treatments and time Init.res <-anova(Init) #display anova results par(mfrow=c(3,2)) #set plotting configuration par(mar=c(1,1,1,1)) #set margins for plots hist(Init$residuals) #plot histogram of residuals boxplot(Init$residuals) #plot boxplot of residuals plot(Init) #display residuals versus fitted, normal QQ plot, leverage plot #PostHoc Tukey adjustment comparison of least-squares means E1.lsm <- lsmeans(Init, ~ Treatment*Day, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model E1.pairs <- multcomp::cld(E1.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display E1.pairs #view results # calculate percent difference in treatment relative to ambient ((Init.Shell.size$avg.Arel[2]-Init.Shell.size$avg.Arel[5])/Init.Shell.size$avg.Arel[2])*100 ((Init.Shell.size$avg.Arel[3]-Init.Shell.size$avg.Arel[6])/Init.Shell.size$avg.Arel[3])*100 #Exposure 1 Plotting Init.Shell.size$Day.Num <- c(1,10,23,1,10,23,1,10,23) #set days as numeric for plotting Fig.Exp1.size <- ggplot(Init.Shell.size, aes(x=Day.Num, y=avg.Arel, group=Treatment)) + geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.6)) + geom_line(aes(linetype=Treatment, colour = Treatment), size = 0.5, position = position_dodge(width = 0.6)) + scale_color_manual(values=c("blue", "mediumvioletred","red")) + geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.6)) + scale_fill_manual(values=c("blue","mediumvioletred","red")) + scale_x_continuous(breaks=seq(0,160,10)) + annotate("text", x=9, y=0.73, label = "a", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=1, y=1.2, label = "abc", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=11, y=1, label = "ab", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=c(9,22.5), y=c(1.2,0.95), label = "b", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=22.5, y=1.25, label = "bc", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=22, y=1.6, label = "c", size = 3) + #add text to the graphic for posthoc letters xlab("Days") + ylab(expression(paste("Relative Size"))) + ylim(0.5,3.5) + theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.key = element_blank(), #remove legend background legend.position=c(.5, .7)) + #set legend location ggtitle("A) Initial Exposure") + theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) Fig.Exp1.size #Common Garden Experimental Component CG.data <- subset(seed.size, Component=="Common.Garden", select = Date:Area) #subset dataframe to common garden data CG.data$Ratio <- CG.data$Length/CG.data$Width #calculate the length:width of the shell #calculating shell area means for Common Garden normalizations CG.norm.area.amb <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.9", select = avg.area) #subset mean area for day 23 ambient condition CG.norm.area.med <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.4", select = avg.area) #subset mean area for day 23 medium condition CG.norm.area.low <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.0", select = avg.area) #subset mean area for day 23 high condition CG.norms <- function(x) { if(x == "pH 7.9") y <- CG.norm.area.amb #if Treatment equals Ambient assign day 23 Ambient mean as normalization factor if(x == "pH 7.4") y <- CG.norm.area.med #if Treatment equals Medium assign day 23 Medium mean as normalization factor if(x == "pH 7.0") y <- CG.norm.area.low #if Treatment equals Low assign day 23 Low mean as normalization factor return(y) } CG.data$A.norm <- as.numeric(sapply(CG.data$Treatment,CG.norms)) #add a column with the normalization values CG.data$A.rel <- CG.data$Area/CG.data$A.norm #normalize the area to be relative size CG.area <- do.call(data.frame,aggregate(Area ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem CG.Len <- do.call(data.frame,aggregate(Length ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem CG.Wid <- do.call(data.frame,aggregate(Width ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem CG.Ratio <- do.call(data.frame,aggregate(Ratio ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem CG.ARel <- do.call(data.frame,aggregate(A.rel ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem CG.Shell.size <- cbind(CG.area, CG.Len$Length.mean, CG.Len$Length.se, CG.Wid$Width.mean, CG.Wid$Width.se, CG.Ratio$Ratio.mean, CG.Ratio$Ratio.se, CG.ARel$A.rel.mean, CG.ARel$A.rel.se) #bind columns into dataframe colnames(CG.Shell.size) <- c("Day", "Treatment", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns CG.Shell.size <- CG.Shell.size %>% filter(Day=="Day135") CG.ARel <- CG.ARel[c(1,3,5),] CG.data.135 <- CG.data %>% filter(Day=="Day135") # Significance testing for common garden CG <- aov(log10(A.rel) ~ Treatment, data=CG.data.135) CG.res <- anova(CG) CG.res par(mfrow=c(3,2)) par(mar=c(1,1,1,1)) hist(CG$residuals) boxplot(CG$residuals) plot(CG) #PostHoc Tukey adjustment comparison of least-squares means CG.lsm <- lsmeans(CG, ~ Treatment, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model CG.pairs <- multcomp::cld(CG.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display CG.pairs #view results # calculate fold difference in treatment relative to ambient for day 135 CG.Shell.size$avg.Arel[3]/CG.Shell.size$avg.Arel[1] CG.Shell.size$avg.Arel[2]/CG.Shell.size$avg.Arel[1] #Common Garden Plotting CG.Shell.size$Day.Num <- c(135,135,135) #set days as numeric for plotting Fig.CG.size <- ggplot(CG.Shell.size, aes(x=Day, y=avg.Arel, group=Treatment)) + geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.6)) + geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.6)) + scale_color_manual(values=c("blue", "mediumvioletred","red")) + annotate("text", x=0.8, y=2.55, label = "a", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=1, y=3.48, label = "b", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=1.2, y=3.3, label = "b", size = 3) + #add text to the graphic for posthoc letters xlab("Treatment") + ylab(expression(paste("Relative Size"))) + ylim(0.5,3.5) + theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.position='none') + #remove legend background ggtitle("B) Common Garden") + theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) Fig.CG.size #Exposure 2 Experimental Component ReExp <- subset(seed.size, Day=="Day145" | Day=="Day158", select = Date:Area) #subset dataframe to exposure 2 data ReExp$Ratio <- ReExp$Length/ReExp$Width #calculate the length:width of the shell #calculating mean for Exposure 2 normalizations E2.norm.area.amb <- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.9", select = avg.area) #subset mean area for day 135 ambient condition E2.norm.area.med <- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.4", select = avg.area) #subset mean area for day 135 medium condition E2.norm.area.low<- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.0", select = avg.area) #subset mean area for day 135 low condition E2.norms <- function(x) { if(x == "pH 7.9") y <- E2.norm.area.amb #if Treatment equals Ambient assign day 135 Ambient mean as normalization factor if(x == "pH 7.4") y <- E2.norm.area.med #if Treatment equals Ambient assign day 135 Medium mean as normalization factor if(x == "pH 7.0") y <- E2.norm.area.low #if Treatment equals Ambient assign day 135 Low mean as normalization factor return(y) } ReExp$A.norm <- as.numeric(sapply(ReExp$Treatment,E2.norms)) #add a column with the normalization values ReExp$A.rel <- ReExp$Area/ReExp$A.norm #normalize the area to be relative size ReExp.area <- do.call(data.frame,aggregate(Area ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem ReExp.Len <- do.call(data.frame,aggregate(Length ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem ReExp.Wid <- do.call(data.frame,aggregate(Width ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem ReExp.Ratio <- do.call(data.frame,aggregate(Ratio ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem ReExp.ARel <- do.call(data.frame,aggregate(A.rel ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem ReExp.Shell.size <- cbind(ReExp.area, ReExp.Len$Length.mean, ReExp.Len$Length.se, ReExp.Wid$Width.mean, ReExp.Wid$Width.se, ReExp.Ratio$Ratio.mean, ReExp.Ratio$Ratio.se, ReExp.ARel$A.rel.mean, ReExp.ARel$A.rel.se) #bind columns into dataframe colnames(ReExp.Shell.size) <- c("Treatment", "Secondary", "Day", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns # Significance testing for secondary exposure Rexps <- aov(log10(A.rel) ~ Day*Treatment*Secondary, data=ReExp) Rexp.res <- anova(Rexps) Rexp.res par(mfrow=c(3,2)) par(mar=c(1,1,1,1)) hist(Rexps$residuals) boxplot(Rexps$residuals) plot(Rexps) Day145 <- subset(ReExp.Shell.size, Day=="Day145", select = Treatment:se.Arel) Day158 <- subset(ReExp.Shell.size, Day=="Day158", select = Treatment:se.Arel) #PostHoc E2.lsm <- lsmeans(Rexps, ~ Day*Treatment*Secondary, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model E2.pairs <- multcomp::cld(E2.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display E2.pairs #view results #ReExp.Shell.size$group <- paste0(ReExp.Shell.size$Treatment, ReExp.Shell.size$Se) #Exposure2 Plotting Fig.Exp2.D10.size <- ggplot(Day145, aes(x=Secondary, y=avg.Arel, group=Treatment)) + geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.05)) + geom_line(aes(linetype=Treatment, colour=Treatment), size = 0.5, position = position_dodge(width = 0.05)) + scale_color_manual(values=c("blue", "mediumvioletred","red")) + geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.05)) + scale_fill_manual(values=c("blue", "mediumvioletred","red")) + annotate("text", x=0.85, y=1.3, label = "b", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=0.85, y=0.88, label = "a", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=c(0.85,2.2,2.2,2.25), y=c(0.94,0.94, 0.88,0.97), label = "ab", size = 3) + #add text to the graphic for posthoc letters xlab("Secondary Treatment") + ylab("Relative Size") + ylim(0.5,1.5) + theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.position='none') + #remove legend background ggtitle("C) Day 145") + theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) Fig.Exp2.D10.size #Exposure2 Plotting Fig.Exp2.D23.size <- ggplot(Day158, aes(x=Secondary, y=avg.Arel, group=Treatment)) + geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.05)) + geom_line(aes(linetype=Treatment, colour=Treatment), size = 0.5, position = position_dodge(width = 0.05)) + scale_color_manual(values=c("blue", "mediumvioletred","red")) + geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.05)) + scale_fill_manual(values=c("blue","mediumvioletred","red")) + annotate("text", x=2.2, y=0.78, label = "a", size = 3) + #add text to the graphic for posthoc letters annotate("text", x=c(0.85,0.85,0.85,2.2,2.2), y=c(0.95,1.04,1.15, 0.88,0.97), label = "ab", size = 3) + #add text to the graphic for posthoc letters xlab("Secondary Treatment") + ylab("Relative Size") + ylim(0.5,1.5) + theme_bw() + #Set the background color theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle axis.line = element_line(color = 'black'), #Set the axes color panel.border = element_blank(), #Set the border panel.grid.major = element_blank(), #Set the major gridlines panel.grid.minor = element_blank(), #Set the minor gridlines plot.background=element_blank(), #Set the plot background legend.position='none') + #remove legend background ggtitle("D) Day 158") + theme(plot.title = element_text(face = 'bold', size = 12, hjust = 0)) Fig.Exp2.D23.size ##### CAPTURE STATISTICAL OUTPUT AND FIGURES TO FILE ##### #Capture Figures to File Figure1.Size <- arrangeGrob(Fig.Exp1.size,Fig.CG.size, Fig.Exp2.D10.size, Fig.Exp2.D23.size, ncol=4) ggsave(file="RAnalysis/Figs/Fig3_Geoduck_Size.pdf", Figure1.Size, width = 10, height = 4, units = c("in")) ggsave(file="RAnalysis/Figs/Fig3_Geoduck_Size.png", Figure1.Size, width = 10, height = 4, units = c("in")) #Table S2 Phenotype ANOVA Results # Title cat("Table_S2_ANOVA_ShellArea\n\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt") # add 2 newlines cat("A) ANOVA results of shell area from initial exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) capture.output(Init.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) cat("\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) cat("B) ANOVA results of shell area from common garden exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) # add 2 newlines capture.output(CG.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) cat("\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) cat("C) ANOVA results of shell area from secondary exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) # add 2 newlines capture.output(Rexp.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE) ``` Methylation Analysis ## Loading genomic and annotation ```{r} #load sample information sample.info <- read.csv("RAnalysis/Data/Sample.Info.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info samp <- sample.info$Sample.ID # set sample info samp <- gsub("[_]", "-", samp) #remove extra characters #load genes gff (see OSF repository: https://osf.io/bcxk7/) Genes <- read.csv("RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3", header=F, sep="\t", na.string="NA", skip=3, stringsAsFactors = F) #read in data file Genes <- Genes[,c(1,4,5,9)] #select desired columns only colnames(Genes) <- c("scaffold", "start", "stop", "gene") #rename columns Genes$gene <- gsub(";.*","",Genes$gene) #remove extra characters Genes$gene <- gsub("ID=","",Genes$gene) #remove extra characters #Load annotation file (see OSF repository) Annot <- read.csv("RAnalysis/Data/Genome/Panopea-generosa-genes-annotations.tab", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE, skip=0) colnames(Annot) <- c("gene","scaffold", "start", "end", "uniprot.id", "uniprot_taxa.id", "Uniprot.Name", "GO.IDs", "GO.Descript") GO.data <- Annot[,c(1,8)] #select names and GOs splitted <- strsplit(as.character(GO.data$GO.IDs), "; ") #split into multiple GO ids Gene.GO.IDs <- data.frame(v1 = rep.int(GO.data$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row colnames(Gene.GO.IDs) <- c("gene", "GO.IDs") #rename columns Gene.GO.IDs$GO.IDs <- as.character(Gene.GO.IDs$GO.IDs) #Gene.GO.IDs$GO.IDs[is.na(Gene.GO.IDs$GO.IDs)] <- "unknown" #replace NA with unknown GO.IDs <-unique(Gene.GO.IDs$GO.IDs) # downloaded 20210407: http://www.informatics.jax.org/gotools/data/input/map2MGIslim.txt GoSlims <- read.csv("RAnalysis/Data/20210407_map2MGIslim.csv", header=TRUE, sep=",", na.string="NA", stringsAsFactors = FALSE, skip=0) colnames(GoSlims) <- c("GO.IDs", "Cat","GO.Slim.Term") Gene.GO.IDs.slims <- merge(Gene.GO.IDs, GoSlims, by="GO.IDs", all = TRUE) Gene.GO.IDs.slims <- Gene.GO.IDs.slims[-1,] GO.IDs.slims <- Gene.GO.IDs.slims[,-2] ``` ## Time dataset ```{bash} #Time dataset #mkdir RAnalysis/Data/OSF/Time multiIntersectBed -i \ RAnalysis/Data/OSF/Time/EPI-41_S38_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-42_S39_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-43_S40_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-44_S41_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-119_S31_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-120_S32_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-135_S35_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-136_S36_L005_5x.tab \ RAnalysis/Data/OSF/Time/EPI-151_S2_L002_5x.tab \ RAnalysis/Data/OSF/Time/EPI-152_S3_L002_5x.tab \ RAnalysis/Data/OSF/Time/EPI-153_S4_L002_5x.tab \ RAnalysis/Data/OSF/Time/EPI-154_S5_L002_5x.tab \ RAnalysis/Data/OSF/Time/EPI-181_S16_L003_5x.tab \ RAnalysis/Data/OSF/Time/EPI-182_S17_L003_5x.tab \ RAnalysis/Data/OSF/Time/EPI-184_S18_L003_5x.tab \ RAnalysis/Data/OSF/Time/EPI-185_S19_L003_5x.tab \ > RAnalysis/Data/OSF/Time/time.5x.bed cat RAnalysis/Data/OSF/Time/time.5x.bed | awk '$4 ==16' > RAnalysis/Data/OSF/Time/time.filtered.16.5x.bed #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file that ends in posOnly #b: annotated gene list #Save output in a new file that has the same base name and ends in -Annotated.txt for f in RAnalysis/Data/OSF/Time/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \ > ${f}_gene done #intersect with file to subset only those positions found in all samples for f in RAnalysis/Data/OSF/Time/*_gene do intersectBed \ -a ${f} \ -b RAnalysis/Data/OSF/Time/time.filtered.16.5x.bed \ > ${f}_CpG_time.bed done wc -l RAnalysis/Data/OSF/Time/*_CpG_time.bed ``` ## Day10 dataset ```{bash} # Day 10 Data Set #mkdir RAnalysis/Data/OSF/D10 multiIntersectBed -i \ RAnalysis/Data/OSF/D10/EPI-103_S27_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-104_S28_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-111_S29_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-113_S30_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-119_S31_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-120_S32_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-127_S33_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-128_S34_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-135_S35_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-136_S36_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-143_S37_L005_5x.tab \ RAnalysis/Data/OSF/D10/EPI-145_S38_L005_5x.tab \ > RAnalysis/Data/OSF/D10/D10.5x.bed cat RAnalysis/Data/OSF/D10/D10.5x.bed | awk '$4 ==12' > RAnalysis/Data/OSF/D10/D10.filtered.12.5x.bed #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file #b: annotated gene list #Save output in a new file that has the same base name for f in RAnalysis/Data/OSF/D10/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \ > ${f}_gene done #intersect with file to subset only those positions found in all samples for f in RAnalysis/Data/OSF/D10/*_gene do intersectBed \ -a ${f} \ -b RAnalysis/Data/OSF/D10/D10.filtered.12.5x.bed \ > ${f}_CpG_D10.bed done wc -l RAnalysis/Data/OSF/D10/*_CpG_D10.bed ``` ## Day135 dataset ```{bash} # Day 135 Data #mkdir RAnalysis/Data/OSF/D135 multiIntersectBed -i \ RAnalysis/Data/OSF/D135/EPI-151_S2_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-152_S3_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-153_S4_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-154_S5_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-159_S6_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-160_S7_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-161_S8_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-162_S9_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-167_S10_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-168_S11_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-169_S12_L002_5x.tab \ RAnalysis/Data/OSF/D135/EPI-170_S13_L002_5x.tab \ > RAnalysis/Data/OSF/D135/D135.5x.bed cat RAnalysis/Data/OSF/D135/D135.5x.bed | awk '$4 ==12' > RAnalysis/Data/OSF/D135/D135.filtered.12.5x.bed #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file #b: annotated gene list #Save output in a new file that has the same base name for f in RAnalysis/Data/OSF/D135/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \ > ${f}_gene done #intersect with file to subset only those positions found in all samples for f in RAnalysis/Data/OSF/D135/*_gene do intersectBed \ -a ${f} \ -b RAnalysis/Data/OSF/D135/D135.filtered.12.5x.bed \ > ${f}_CpG_D135.bed done wc -l RAnalysis/Data/OSF/D135/*_CpG_D135.bed ``` ## Day145 dataset ```{bash} # Day 145 Data #mkdir RAnalysis/Data/OSF/D145 multiIntersectBed -i \ RAnalysis/Data/OSF/D145/EPI-175_S14_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-176_S15_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-181_S16_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-182_S17_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-184_S18_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-185_S19_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-187_S20_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-188_S21_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-193_S22_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-194_S23_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-199_S24_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-200_S25_L003_5x.tab \ RAnalysis/Data/OSF/D145/EPI-205_S26_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-206_S27_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-208_S28_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-209_S29_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-214_S30_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-215_S31_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-220_S32_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-221_S33_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-226_S34_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-227_S35_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-229_S36_L004_5x.tab \ RAnalysis/Data/OSF/D145/EPI-230_S37_L004_5x.tab \ > RAnalysis/Data/OSF/D145/D145.5x.bed cat RAnalysis/Data/OSF/D145/D145.5x.bed | awk '$4 ==24' > RAnalysis/Data/OSF/D145/D145.filtered.24.5x.bed #Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes #wb: Print all lines in the second file #a: file #b: annotated gene list #Save output in a new file that has the same base name for f in RAnalysis/Data/OSF/D145/*5x.tab do intersectBed \ -wb \ -a ${f} \ -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \ > ${f}_gene done #intersect with file to subset only those positions found in all samples for f in RAnalysis/Data/OSF/D145/*_gene do intersectBed \ -a ${f} \ -b RAnalysis/Data/OSF/D145/D145.filtered.24.5x.bed \ > ${f}_CpG_D145.bed done wc -l RAnalysis/Data/OSF/D145/*_CpG_D145.bed ``` Load filtered methylation counts (see OSF Repository for all data files) ## Time dataset ```{r} # Time meth.data.T <- list.files(path = "RAnalysis/Data/OSF/Time", pattern = "time.bed$", full.names=TRUE) %>% set_names(.) %>% map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% dplyr::select(-c(V3,V7:V14)) %>% group_by(Sample.ID) colnames(meth.data.T) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene") meth.data.T$gene <- gsub(";.*","",meth.data.T$gene) #remove extra characters meth.data.T$gene <- gsub("ID=","",meth.data.T$gene) #remove extra characters meth.data.T$Sample.ID <- gsub("RAnalysis/Data/OSF/Time/","",meth.data.T$Sample.ID) #remove extra characters meth.data.T$Sample.ID <- gsub("_.*","",meth.data.T$Sample.ID) #remove extra characters meth.data.T$Sample.ID <- gsub("-","_",meth.data.T$Sample.ID) #remove extra characters MD.Time <- merge(meth.data.T,sample.info, by="Sample.ID") ``` ## Day10 dataset ```{r} #Day 10 meth.data.D10 <- list.files(path = "RAnalysis/Data/OSF/D10", pattern = "D10.bed$", full.names=TRUE) %>% set_names(.) %>% map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% dplyr::select(-c(V3,V7:V14)) %>% group_by(Sample.ID) colnames(meth.data.D10) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene") meth.data.D10$gene <- gsub(";.*","",meth.data.D10$gene) #remove extra characters meth.data.D10$gene <- gsub("ID=","",meth.data.D10$gene) #remove extra characters meth.data.D10$Sample.ID <- gsub("RAnalysis/Data/OSF/D10/","",meth.data.D10$Sample.ID) #remove extra characters meth.data.D10$Sample.ID <- gsub("_.*","",meth.data.D10$Sample.ID) #remove extra characters meth.data.D10$Sample.ID <- gsub("-","_",meth.data.D10$Sample.ID) #remove extra characters MD.D10 <- merge(meth.data.D10, sample.info, by="Sample.ID") ``` ## Day135 dataset ```{r} #Day 135 meth.data.D135 <- list.files(path = "RAnalysis/Data/OSF/D135", pattern = "D135.bed$", full.names=TRUE) %>% set_names(.) %>% map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% dplyr::select(-c(V3,V7:V14)) %>% group_by(Sample.ID) colnames(meth.data.D135) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene") meth.data.D135$gene <- gsub(";.*","",meth.data.D135$gene) #remove extra characters meth.data.D135$gene <- gsub("ID=","",meth.data.D135$gene) #remove extra characters meth.data.D135$Sample.ID <- gsub("RAnalysis/Data/OSF/D135/","",meth.data.D135$Sample.ID) #remove extra characters meth.data.D135$Sample.ID <- gsub("_.*","",meth.data.D135$Sample.ID) #remove extra characters meth.data.D135$Sample.ID <- gsub("-","_",meth.data.D135$Sample.ID) #remove extra characters MD.D135 <- merge(meth.data.D135, sample.info, by="Sample.ID") ``` ## Day145 dataset ```{r} #Day 145 meth.data.D145 <- list.files(path = "RAnalysis/Data/OSF/D145", pattern = "D145.bed$", full.names=TRUE) %>% set_names(.) %>% map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% dplyr::select(-c(V3,V7:V14)) %>% group_by(Sample.ID) colnames(meth.data.D145) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene") meth.data.D145$gene <- gsub(";.*","", meth.data.D145$gene) #remove extra characters meth.data.D145$gene <- gsub("ID=","",meth.data.D145$gene) #remove extra characters meth.data.D145$Sample.ID <- gsub("RAnalysis/Data/OSF/D145/","",meth.data.D145$Sample.ID) #remove extra characters meth.data.D145$Sample.ID <- gsub("_.*","",meth.data.D145$Sample.ID) #remove extra characters meth.data.D145$Sample.ID <- gsub("-","_",meth.data.D145$Sample.ID) #remove extra characters MD.D145 <- merge(meth.data.D145, sample.info, by="Sample.ID") ``` GO identification for background set ```{r} #generate list of all genes identified and background list for GO enrichment analysis MD.ALL <- rbind(MD.Time, MD.D10, MD.D135, MD.D145) MD.ALL.Genes <- unique(MD.ALL$gene) write.csv(MD.ALL.Genes, file = "RAnalysis/Output/MD.ALL.Genes_For_GO_Background.csv") ``` # Testing for Differentially Methylated Genes ## Time ```{r} # Comparison of Ambient samples through Time # Binomial GLM to test for differentially methylated genes sub_meth_table <- MD.Time sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene) #filter for genes with >5 methylated positions min.filt <- count(sub_meth_table, vars = c( group)) newdata <- min.filt[ which(min.filt$n > 4), ] sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,] # create data frame to stored results results <- data.frame() gs <- unique(sub_meth_table$gene) #first subset the unique dataframes and second run the GLMs for(i in 1:length(sub_meth_table$gene)){ #subset the dataframe gene by gene sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i]) # fit glm position model fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ TimePoint, data=sub_meth_table1, family=binomial) a <- anova(fit, test="Chisq") # capture summary stats to data frame df <- data.frame(gene = sub_meth_table1[1,7], pval.treatment = a$`Pr(>Chi)`[2], #pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene #pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment stringsAsFactors = F) # bind rows of temporary data frame to the results data frame results <- rbind(results, df) } # An error will be generated here for contrasts. #This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene #Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels #Continuing the analysis from results line will generate the results in the absence of the contrast (interaction). results[is.na(results)] <- 0 results$adj.pval.TimePoint <- p.adjust(results$pval.treatment, method='BH') #results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene #results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment Time.Amb.sig <-results Time.Amb.sig <- Time.Amb.sig[,c(1,3)] Time.Amb.sig <- Time.Amb.sig[order(Time.Amb.sig$adj.pval.TimePoint),] Time.Amb.sig <- Time.Amb.sig[which(Time.Amb.sig$adj.pval.TimePoint<0.05), ] # Annotation of DMG under ambient conditions through time Time.Amb.annot <- merge(Time.Amb.sig , Annot, by="gene", all.x=TRUE) Time.Amb.annot <- Time.Amb.annot[order(Time.Amb.annot$adj.pval.TimePoint),] Time.Amb.annot <- Time.Amb.annot[!duplicated(Time.Amb.annot$gene),] write.table(Time.Amb.annot, 'RAnalysis/Output/Supplemental_Tables/TableS10_Time_sig_annot.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs by time high <- MD.Time %>% filter(gene== Time.Amb.sig$gene[1]) means <- aggregate(per.meth ~ TimePoint, data=high, FUN=mean) ses <- aggregate(per.meth ~ TimePoint, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(means, aes(x=TimePoint, y=per.meth)) + geom_bar(stat="identity", color="black", position=position_dodge()) + geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2, position=position_dodge(.9)) + xlab("TimePoint") + #plot x axis label ylab("Percent Methylation") + #plot y axis label ylim(0,100)+ #set y limits to 0 and 100% scale_fill_manual(values = c("blue", "purple", "red"))+ theme_bw() ``` ## Time GO enrichment ```{r} # GO Enrichment Analysis of Ambient samples through Time ##### GO enrichment of DMGs ##### ALL<-as.data.frame(MD.ALL.Genes) #set the all gene list colnames(ALL) <-c("gene") #name the column for merging DMG <- as.character(Time.Amb.sig$gene) #set the enrichment test list L <- left_join(ALL, Genes, by="gene") #generate at dataframe to calculate lengths L$length <- L$stop-L$start # calculated lengths from start and stop GOAL <- merge(ALL, Gene.GO.IDs.slims,by="gene", all=T) #merge the all gene list with GO ids GOAL$GO.Slim.Term[is.na(GOAL$GO.Slim.Term)] <- "unknown" #list NA as unknown splitted <- strsplit(as.character(GOAL$GO.IDs), "; ") #slit into multiple GO ids to get them all in a single row GO.terms <- data.frame(v1 = rep.int(GOAL$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row GO.slimmed.terms <-GOAL[,c(1,4)] #subset slims only #change to vectors ALL.vector <-c(t(L$gene)) DMG.vector <-c(t(DMG)) ID.vector <- L$gene LENGTH.vector <-L$length gene.vector=as.integer(ALL.vector%in%DMG.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) GO.Time <- GO.wall[order(GO.wall$over_represented_pvalue),] write.csv(GO.Time , file = "RAnalysis/Output/GO.Time.csv") colnames(GO.Time)[1] <- "GO.IDs" GO.Time <- left_join(GO.Time, GO.IDs.slims, by="GO.IDs") GO.Time <- GO.Time[!duplicated(GO.Time$GO.IDs), ] sig.GO.Time <- GO.Time %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) write.csv(sig.GO.Time , file = "RAnalysis/Output/sig.GO.Time.csv") #save GO enrichment table with gene IDs and protein names for (i in 1:nrow(sig.GO.Time)){ sig.GO.Time$gene[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"gene"]), collapse = "; ") sig.GO.Time$uniprot[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"uniprot.id"]), collapse = "; ") sig.GO.Time$uniprot_taxa.id[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ") sig.GO.Time$uniprot.name[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ") } write.csv(sig.GO.Time , file = "RAnalysis/Output/Supplemental_Tables/TableS11_sig.GO.Time.csv") sig.GO.Time.BP <- GO.Time %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% filter(ontology == "BP") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.Time.BP$term <- factor(sig.GO.Time.BP$term, levels = sig.GO.Time.BP$term[nrow(sig.GO.Time.BP):1]) #plot significantly enriched GO terms by Slim Category TimeGO.BP <- ggplot(sig.GO.Time.BP, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) sig.GO.Time.MF <- GO.Time %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >1) %>% filter(ontology == "MF") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.Time.MF$term <- factor(sig.GO.Time.MF$term, levels = sig.GO.Time.MF$term[nrow(sig.GO.Time.MF):1]) #plot significantly enriched GO terms by Slim Category TimeGO.MF <- ggplot(sig.GO.Time.MF, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) Time.GO.Sig <- ggarrange(TimeGO.BP,TimeGO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/TimeGO.pdf", Time.GO.Sig, width = 15, height = 12, units = c("in")) ``` ## Day10 ```{r} # Comparison of Treatments at Day 10 # Binomial GLM to test for differentially methylated genes meth_table <- MD.D10 sub_meth_table <- meth_table[meth_table$Initial.Treatment == 'Ambient' | meth_table$Initial.Treatment == 'Low' | meth_table$Initial.Treatment == 'Super.Low', ] sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene) #filter for genes with >5 methylated positions min.filt <- count(sub_meth_table, vars = c( group)) newdata <- min.filt[ which(min.filt$n > 4), ] sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,] # create data frame to stored results results <- data.frame() gs <- unique(sub_meth_table$gene) #first subset the unique dataframes and second run the GLMs for(i in 1:length(sub_meth_table$gene)){ #subset the dataframe gene by gene sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i]) # fit glm position model fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment, data=sub_meth_table1, family=binomial, maxit = 100) a <- anova(fit, test="Chisq") # capture summary stats to data frame df <- data.frame(gene = sub_meth_table1[1,7], pval.treatment = a$`Pr(>Chi)`[2], #pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene #pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment stringsAsFactors = F) # bind rows of temporary data frame to the results data frame results <- rbind(results, df) } # An error will be generated here for contrasts. #This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene #Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels #Continuing the analysis from results line will generate the results in the absence of the contrast (interaction). results[is.na(results)] <- 0 results$adj.pval.Initial.Treatment <- p.adjust(results$pval.treatment, method='BH') #results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene #results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment D10.sig <-results D10.sig <- D10.sig[,c(1,3)] D10.sig <- D10.sig[order(D10.sig$adj.pval.Initial.Treatment),] D10.sig <- D10.sig[which(D10.sig$adj.pval.Initial.Treatment<0.05), ] write.table(D10.sig, 'RAnalysis/Output/D10_sig.tsv', sep='\t', row.names=FALSE) # Annotation of D10 DMG D10.sig.annot <- merge(D10.sig, Annot, by="gene", all.x=TRUE) D10.sig.annot <- D10.sig.annot[order(D10.sig.annot$adj.pval.Initial.Treatment),] D10.sig.annot <- D10.sig.annot[!duplicated(D10.sig.annot$gene),] write.table(D10.sig.annot, 'RAnalysis/Output/Supplemental_Tables/TableS12_D10_sig_annot.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs high <- MD.D10 %>% filter(gene== D10.sig$gene[1]) means <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=mean) ses <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(means, aes(x=Initial.Treatment, y=per.meth, fill=Initial.Treatment)) + geom_bar(stat="identity", color="black", position=position_dodge()) + geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2, position=position_dodge(.9)) + xlab("Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label ylim(0,100)+ #set y limits to 0 and 100% scale_fill_manual(values = c("blue", "purple", "red"))+ theme_bw() ``` ## Day10 GO enrichment ```{r} # GO Enrichment Analysis of Treatment at Day 10 ##### GO enrichment of DMGs ##### DMG.D10 <- as.character(D10.sig$gene) #set the enrichment test list DMG.D10.vector <-c(t(DMG.D10)) #change to vectors gene.vector=as.integer(ALL.vector%in%DMG.D10.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.D10.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, GO.D10.wall<-goseq(DEG.D10.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) GO.D10 <- GO.D10.wall[order(GO.D10.wall$over_represented_pvalue),] write.csv(GO.D10 , file = "RAnalysis/Output/GO.D10.csv") colnames(GO.D10)[1] <- "GO.IDs" GO.D10 <- left_join(GO.D10, GO.IDs.slims, by="GO.IDs") GO.D10 <- GO.D10[!duplicated(GO.D10$GO.IDs), ] sig.GO.D10 <- GO.D10 %>% filter(over_represented_pvalue <0.05) %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) write.csv(sig.GO.D10 , file = "RAnalysis/Output/sig.GO.D10.csv") #save GO enrichment table with gene IDs and protein names for (i in 1:nrow(sig.GO.D10)){ sig.GO.D10$gene[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"gene"]), collapse = "; ") sig.GO.D10$uniprot[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"uniprot.id"]), collapse = "; ") sig.GO.D10$uniprot_taxa.id[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ") sig.GO.D10$uniprot.name[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ") } write.csv(sig.GO.D10 , file = "RAnalysis/Output/Supplemental_Tables/TableS13_sig.GO.D10.csv") sig.GO.D10.BP <- GO.D10 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% filter(ontology == "BP") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D10.BP$term <- factor(sig.GO.D10.BP$term, levels = sig.GO.D10.BP$term[nrow(sig.GO.D10.BP):1]) #plot significantly enriched GO terms by Slim Category D10GO.BP <- ggplot(sig.GO.D10.BP, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) sig.GO.D10.MF <- GO.D10 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >1) %>% filter(ontology == "MF") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D10.MF$term <- factor(sig.GO.D10.MF$term, levels = sig.GO.D10.MF$term[nrow(sig.GO.D10.MF):1]) #plot significantly enriched GO terms by Slim Category D10GO.MF <- ggplot(sig.GO.D10.MF, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) D10.GO.Sig <- ggarrange(D10GO.BP,D10GO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/GO.D10.pdf", D10.GO.Sig, width = 15, height = 12, units = c("in")) ``` ```{r} # Comparison of Treatments at Day 135 # Binomial GLM to test for differentially methylated genes meth_table <- MD.D135 sub_meth_table <- meth_table[meth_table$Initial.Treatment == 'Ambient' | meth_table$Initial.Treatment == 'Low' | meth_table$Initial.Treatment == 'Super.Low', ] sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene) #filter for genes with >5 methylated positions min.filt <- count(sub_meth_table, vars = c( group)) newdata <- min.filt[ which(min.filt$n > 4), ] sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,] # create data frame to store results results <- data.frame() gs <- unique(sub_meth_table$gene) #first subset the unique dataframes and second run the GLMs for(i in 1:length(sub_meth_table$gene)){ #subset the dataframe gene by gene sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i]) #if(length(unique(sub_meth_table1$position)) >1){ # fit glm position model fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment, data=sub_meth_table1, family=binomial) #} else { # # fit glm model # fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment * position, # data=sub_meth_table1, family=binomial) #} #s <- step(fit, trace=0) a <- anova(fit, test="Chisq") a # capture summary stats to data frame df <- data.frame(gene = sub_meth_table1[1,7], pval.treatment = a$`Pr(>Chi)`[2], #pval.position = a$`Pr(>Chi)`[3], #pval.treatment_x_position = a$`Pr(>Chi)`[4], stringsAsFactors = F) # bind rows of temporary data frame to the results data frame results <- rbind(results, df) } # An error will be generated here for contrasts. #This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene #Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels #Continuing the analysis from results line will generate the results in the absence of the contrast (interaction). results[is.na(results)] <- 0 results$adj.pval.treatment <- p.adjust(results$pval.treatment, method='BH') #results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') D135.sig <-results D135.sig <- D135.sig[,c(1,3)] D135.sig <- D135.sig[order(D135.sig$adj.pval.treatment),] D135.sig <- D135.sig[which(D135.sig$adj.pval.treatment<0.05), ] write.table(D135.sig, 'RAnalysis/Output/D135_sig.tsv', sep='\t', row.names=FALSE) # Annotation of D135 DMG D135.sig.annot <- merge(D135.sig, Annot, by="gene", all.x=TRUE) D135.sig.annot <- D135.sig.annot[order(D135.sig.annot$adj.pval.treatment),] D135.sig.annot <- D135.sig.annot[!duplicated(D135.sig.annot$gene),] write.table(D135.sig.annot, 'RAnalysis/Output/Supplemental_Tables/TableS14_D135_sig_annot.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs high <- MD.D135 %>% filter(gene== D135.sig$gene[1]) means <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=mean) ses <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(means, aes(x=Initial.Treatment, y=per.meth, fill=Initial.Treatment)) + geom_bar(stat="identity", color="black", position=position_dodge()) + geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2, position=position_dodge(.9)) + xlab("Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label ylim(0,100)+ #set y limits to 0 and 100% scale_fill_manual(values = c("blue", "purple", "red"))+ theme_bw() ``` ```{r} # GO Enrichment Analysis of Treatment at Day 135 ##### GO enrichment of DMGs ##### DMG.D135 <- as.character(D135.sig$gene) #set the enrichment test list DMG.D135.vector <-c(t(DMG.D135)) #change to vectors gene.vector=as.integer(ALL.vector%in%DMG.D135.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.D135.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, GO.D135.wall<-goseq(DEG.D135.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) GO.D135 <- GO.D135.wall[order(GO.D135.wall$over_represented_pvalue),] write.csv(GO.D135 , file = "RAnalysis/Output/GO.D135.csv") colnames(GO.D135)[1] <- "GO.IDs" GO.D135 <- left_join(GO.D135, GO.IDs.slims, by="GO.IDs") GO.D135 <- GO.D135[!duplicated(GO.D135$GO.IDs), ] sig.GO.D135 <- GO.D135 %>% filter(over_represented_pvalue <0.05) %>% #filter(numInCat >5) %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) write.csv(sig.GO.D135 , file = "RAnalysis/Output/sig.GO.D135.csv") #save GO enrichment table with gene IDs and protein names #for D135 for (i in 1:nrow(sig.GO.D135)){ sig.GO.D135$gene[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"gene"]), collapse = "; ") sig.GO.D135$uniprot[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"uniprot.id"]), collapse = "; ") sig.GO.D135$uniprot_taxa.id[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ") sig.GO.D135$uniprot.name[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ") } write.csv(sig.GO.D135 , file = "RAnalysis/Output/Supplemental_Tables/TableS15_sig.GO.D135.csv") sig.GO.D135.BP <- GO.D135 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% filter(ontology == "BP") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D135.BP$term <- factor(sig.GO.D135.BP$term, levels = sig.GO.D135.BP$term[nrow(sig.GO.D135.BP):1]) #plot significantly enriched GO terms by Slim Category D135GO.BP <- ggplot(sig.GO.D135.BP, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) sig.GO.D135.MF <- GO.D135 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >1) %>% filter(ontology == "MF") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D135.MF$term <- factor(sig.GO.D135.MF$term, levels = sig.GO.D135.MF$term[nrow(sig.GO.D135.MF):1]) #plot significantly enriched GO terms by Slim Category D135GO.MF <- ggplot(sig.GO.D135.MF, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) D135.GO.Sig <- ggarrange(D135GO.BP,D135GO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/GO.D135.pdf", D135.GO.Sig, width = 15, height = 12, units = c("in")) ``` ```{r} # Comparison of Treatments at Day 145 # Binomial GLM to test for differentially methylated genes meth_table <- MD.D145 sub_meth_table <- meth_table[meth_table$Initial.Treatment == "Ambient" | meth_table$Initial.Treatment == "Low" | meth_table$Initial.Treatment == "Super.Low" | meth_table$Secondary.Treatment == "Ambient" | meth_table$Secondary.Treatment == "Low", ] sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene) #filter for genes with >5 methylated positions min.filt <- count(sub_meth_table, vars = c( group)) newdata <- min.filt[ which(min.filt$n > 4), ] sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,] # create data frame to store results results <- data.frame() gs <- unique(sub_meth_table$gene) #first subset the unique dataframes and second run the GLMs for(i in 1:length(sub_meth_table$gene)){ #subset the dataframe gene by gene sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i]) # fit glm position model fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment * Secondary.Treatment, data=sub_meth_table1, family=binomial) a <- anova(fit, test="LRT") # capture summary stats to data frame df <- data.frame(gene = sub_meth_table1[1,7], pval.treatment1 = a$`Pr(>Chi)`[2], pval.treatment2 = a$`Pr(>Chi)`[3], #position = a$`Pr(>Chi)`[2], pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4], #pval.treatment1_x_position = a$`Pr(>Chi)`[2], #pval.treatment2_x_position = a$`Pr(>Chi)`[2], #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2], stringsAsFactors = F) # bind rows of temporary data frame to the results data frame results <- rbind(results, df) } # An error will be generated here for contrasts. #This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene #Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels #Continuing the analysis from results line will generate the results in the absence of the contrast (interaction). results[is.na(results)] <- 0 results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH') results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH') results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH') #results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH') #results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH') #results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH') D145.sig <-results D145.sig <- D145.sig[,c(1,5:7)] # Identifying DMG with significant interaction D145.sig.int <- D145.sig[order(D145.sig$adj.pval.treatment1_x_treatment2),] D145.sig.int <- D145.sig.int[which(D145.sig$adj.pval.treatment1_x_treatment2<0.05), ] write.table(D145.sig, 'RAnalysis/Output/D145_sig.int.tsv', sep='\t', row.names=FALSE) # Annotation of Secondary Exposure Interaction DMG D145.sig.int.annot <- merge(D145.sig.int, Annot, by="gene", all.x=TRUE) D145.sig.int.annot <- D145.sig.int.annot[order(D145.sig.int.annot$adj.pval.treatment1_x_treatment2),] D145.sig.int.annot <- D145.sig.int.annot[!duplicated(D145.sig.int.annot$gene),] write.table(D145.sig.int.annot, 'RAnalysis/Output/Supplemental_Tables/TableS16_D145_sig_annot_Interactions.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs with sig interaction high <- MD.D145 %>% filter(gene== D145.sig$gene[1]) means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean) ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data geom_line(size=0.7, position=position_dodge(.1)) + #plot lines scale_colour_manual(values = c("blue", "purple", "red")) + #set line color geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics scale_shape_manual(values=c(1,18,16)) + #set shapes geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics xlab("Secondary Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label theme_bw() # Identifying DMG with significant main effect of Exposure 1 D145.sig.Exp1 <- D145.sig[which(D145.sig$adj.pval.treatment1<0.05), ] write.table(D145.sig.Exp1, 'RAnalysis/Output/D145_sig.Exp1.tsv', sep='\t', row.names=FALSE) # Annotation of Main Effect of Initial for 145 DMGs D145.sig.Exp1.annot <- merge(D145.sig.Exp1 , Annot, by="gene", all.x=TRUE) D145.sig.Exp1.annot <- D145.sig.Exp1.annot[!duplicated(D145.sig.Exp1.annot$gene),] write.table(D145.sig.Exp1.annot , 'RAnalysis/Output/D145_sig_annot_Exposure1.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs with sig Exposure 1 high <- MD.D145 %>% filter(gene== D145.sig.Exp1$gene[1]) means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean) ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data geom_line(size=0.7, position=position_dodge(.1)) + #plot lines scale_colour_manual(values = c("blue", "purple", "red")) + #set line color geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics scale_shape_manual(values=c(1,18,16)) + #set shapes geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics xlab("Secondary Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label theme_bw() # Identifying DMG with significant main effect of Exposure 2 D145.sig.Exp2 <- D145.sig[which(D145.sig$adj.pval.treatment2<0.05), ] write.table(D145.sig.Exp2, 'RAnalysis/Output/D145_sig.Exp2.tsv', sep='\t', row.names=FALSE) # Annotation of Main Effect of Secondary for 145 DMGs D145.sig.Exp2.annot <- merge(D145.sig.Exp2 , Annot, by="gene", all.x=TRUE) D145.sig.Exp2.annot <- D145.sig.Exp2.annot[!duplicated(D145.sig.Exp2.annot$gene),] #write.table(D145.sig.Exp2.annot , 'RAnalysis/Output/D145_sig_annot_Exposure2.tsv', sep='\t', row.names=FALSE) #Sanity check plotting of top DMGs with sig Exposure 1 high <- MD.D145 %>% filter(gene== D145.sig.Exp2$gene[66]) means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean) ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error) means$ses <- ses$per.meth ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data geom_line(size=0.7, position=position_dodge(.1)) + #plot lines scale_colour_manual(values = c("blue", "purple", "red")) + #set line color geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics scale_shape_manual(values=c(1,18,16)) + #set shapes geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics xlab("Secondary Treatment") + #plot x axis label ylab("Percent Methylation") + #plot y axis label theme_bw() ``` ```{r} # GO Enrichment Analysis of Treatment at Day 145 ##### GO enrichment of DMGs ##### DMG.D145 <- as.character(D145.sig.int$gene) #set the enrichment test list DMG.D145.vector <-c(t(DMG.D145)) #change to vectors gene.vector=as.integer(ALL.vector%in%DMG.D145.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.D145.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, GO.D145.wall<-goseq(DEG.D145.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) GO.D145 <- GO.D145.wall[order(GO.D145.wall$over_represented_pvalue),] write.csv(GO.D145 , file = "RAnalysis/Output/GO.D145.Interaction.csv") colnames(GO.D145)[1] <- "GO.IDs" GO.D145 <- left_join(GO.D145, GO.IDs.slims, by="GO.IDs") GO.D145 <- GO.D145[!duplicated(GO.D145$GO.IDs), ] sig.GO.D145 <- GO.D145 %>% filter(over_represented_pvalue <0.05) %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) write.csv(sig.GO.D145 , file = "RAnalysis/Output/sig.GO.D145.csv") #save GO enrichment table with gene IDs and protein names for (i in 1:nrow(sig.GO.D145)){ sig.GO.D145$gene[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"gene"]), collapse = "; ") sig.GO.D145$uniprot[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"uniprot.id"]), collapse = "; ") sig.GO.D145$uniprot_taxa.id[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ") sig.GO.D145$uniprot.name[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ") } write.csv(sig.GO.D145 , file = "RAnalysis/Output/Supplemental_Tables/TableS17_sig.GO.D145.Interactions.csv") sig.GO.D145.BP <- GO.D145 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% filter(ontology == "BP") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D145.BP$term <- factor(sig.GO.D145.BP$term, levels = sig.GO.D145.BP$term[nrow(sig.GO.D145.BP):1]) #plot significantly enriched GO terms by Slim Category D145GO.BP <- ggplot(sig.GO.D145.BP, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) sig.GO.D145.MF <- GO.D145 %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >1) %>% filter(ontology == "MF") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.D145.MF$term <- factor(sig.GO.D145.MF$term, levels = sig.GO.D145.MF$term[nrow(sig.GO.D145.MF):1]) #plot significantly enriched GO terms by Slim Category D145GO.MF <- ggplot(sig.GO.D145.MF, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) D145.GO.Sig <- ggarrange(D145GO.BP,D145GO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/GO.D145.pdf", D145.GO.Sig, width = 15, height = 12, units = c("in")) ``` ```{r} ##### Persistent DMGs ##### #Persistent from first exposure throughout common garden ints_10_135 <- merge(D10.sig, D135.sig, by="gene") ints_10_135.annot <- merge(ints_10_135, Annot, by="gene", all.x=TRUE) ints_10_135.annot <- ints_10_135.annot[!duplicated(ints_10_135.annot$gene),] ints_10_135.annot <- na.omit(ints_10_135.annot) #Persistent throughout all timepoints ints_all <- merge(ints_10_135, D145.sig, by="gene") ints_all.annot <- merge(ints_all, Annot, by="gene", all.x=TRUE) ints_all.annot <- ints_all.annot[!duplicated(ints_all.annot$gene),] write.table(ints_all.annot, 'RAnalysis/Output/Supplemental_Tables/TableS18_DMGs_persist.tsv', sep='\t', row.names=FALSE) ints_all.annot <- na.omit(ints_all.annot) ``` ```{r} # GO Enrichment Analysis of DMG present in all comparisons #### GO enrichment of DMGs ##### DMG.Persist <- as.character(ints_all.annot$gene) #set the enrichment test list #change contig lists to vectors DMG.vector <-c(t(DMG.Persist)) gene.vector=as.integer(ALL.vector%in%DMG.vector) #Construct new vector with 1 for DEG and 0 for others names(gene.vector)=ALL.vector #set names DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene #Find enriched GO terms, GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE) GO.Persist <- GO.wall[order(GO.wall$over_represented_pvalue),] write.csv(GO.Persist , file = "RAnalysis/Output/GO.Persist .csv") colnames(GO.Persist )[1] <- "GO.IDs" GO.Persist <- left_join(GO.Persist, GO.IDs.slims, by="GO.IDs") GO.Persist <- GO.Persist [!duplicated(GO.Persist$GO.IDs), ] sig.GO.Persist <- GO.Persist %>% filter(over_represented_pvalue <0.05) %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) write.csv(sig.GO.Persist , file = "RAnalysis/Output/sig.GO.Persist.csv") #save GO enrichment table with gene IDs and protein names for (i in 1:nrow(sig.GO.Persist)){ sig.GO.Persist$gene[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"gene"]), collapse = "; ") sig.GO.Persist$uniprot[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"uniprot.id"]), collapse = "; ") sig.GO.Persist$uniprot_taxa.id[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ") sig.GO.Persist$uniprot.name[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ") } write.csv(sig.GO.Persist , file = "RAnalysis/Output/Supplemental_Tables/TableS19_sig.GO.Persistent.annot.csv") sig.GO.Persist.BP <- GO.Persist %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >5) %>% filter(ontology == "BP") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.Persist.BP$term <- factor(sig.GO.Persist.BP$term, levels = sig.GO.Persist.BP$term[nrow(sig.GO.Persist.BP):1]) #plot significantly enriched GO terms by Slim Category PersistGO.BP <- ggplot(sig.GO.Persist.BP, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) sig.GO.Persist.MF <- GO.Persist %>% filter(over_represented_pvalue <0.05) %>% filter(numInCat >1) %>% filter(ontology == "MF") %>% arrange(., ontology, GO.Slim.Term, over_represented_pvalue) sig.GO.Persist.MF$term <- factor(sig.GO.Persist.MF$term, levels = sig.GO.Persist.MF$term[nrow(sig.GO.D145.MF):1]) #plot significantly enriched GO terms by Slim Category PersistGO.MF <- ggplot(sig.GO.Persist.MF, aes(x = ontology, y = term)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 10), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/GO.Persist.pdf", Persist.GO.Sig, width = 15, height = 12, units = c("in")) ``` ```{r} ##### Heatmap of DMG in Ambient by time ##### Time.Amb.meth_table.means <- aggregate(per.meth ~ TimePoint*gene, data=MD.Time, FUN=mean) Time.Amb.meth_table.means <- Time.Amb.meth_table.means[Time.Amb.meth_table.means$gene %in% Time.Amb.sig$gene,] Time.Amb <- Time.Amb.meth_table.means %>% pivot_wider(names_from = c(TimePoint), values_from = per.meth) Time.Amb.mat <- as.matrix(Time.Amb[,-1]) #new code as of Feb 22, 2021 (S.A.T.) pdf("RAnalysis/Output/Time.DMG_Rel_HEATMAP_v2.pdf") heatmap.2(Time.Amb.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() { breaks = pretty(parent.frame()$breaks) #breaks = breaks[c(1,length(breaks))] list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=FALSE, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE, lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4)) dev.off() ``` ```{r} ##### Heatmap of DMG in D10 ##### D10.meth_table.means <- aggregate(per.meth ~ Initial.Treatment*gene, data=MD.D10, FUN=mean) D10.meth_table.means <- D10.meth_table.means[D10.meth_table.means$gene %in% D10.sig$gene,] D10 <- D10.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment), values_from = per.meth) D10.mat <- as.matrix(D10[,-1]) col.order <- c( "Ambient", "Low", "Super.Low") D10.mat <- D10.mat[,col.order] #new code as of Feb 22, 2021 (S.A.T.) colnames(D10.mat) <- c("pH 7.9", "pH 7.4", "pH 7.0") pdf("RAnalysis/Output/D10.DMG_Rel_HEATMAP_v2.pdf") heatmap.2(D10.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() { breaks = pretty(parent.frame()$breaks) #breaks = breaks[c(1,length(breaks))] list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=NA, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4)) dev.off() ``` ```{r} ##### Heatmap of DMG in D135 ##### D135.meth_table.means <- aggregate(per.meth ~ Initial.Treatment*gene, data=MD.D135, FUN=mean) D135.meth_table.means <- D135.meth_table.means[D135.meth_table.means$gene %in% D135.sig$gene,] D135 <- D135.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment), values_from = per.meth) D135.mat <- as.matrix(D135[,-1]) col.order <- c( "Ambient", "Low", "Super.Low") D135.mat <- D135.mat[,col.order] #new code as of Feb 22, 2021 (S.A.T.) colnames(D135.mat) <- c("pH 7.9", "pH 7.4", "pH 7.0") pdf("RAnalysis/Output/D135.DMG_Rel_HEATMAP_v2.pdf") heatmap.2(D135.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() { breaks = pretty(parent.frame()$breaks) #breaks = breaks[c(1,length(breaks))] list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=NA, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4)) dev.off() ``` ```{r} ##### Heatmap of DMG in D145 ##### D145.meth_table.means <- aggregate(per.meth ~ gene*Initial.Treatment*Secondary.Treatment, data=MD.D145, FUN=mean) D145.meth_table.means <- D145.meth_table.means[D145.meth_table.means$gene %in% D145.sig$gene,] D145 <- D145.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment, Secondary.Treatment), values_from = per.meth) D145.mat <- as.matrix(D145[,-1]) col.order <- c("Ambient_Ambient", "Ambient_Low", "Low_Ambient", "Low_Low", "Super.Low_Ambient", "Super.Low_Low") D145.mat <- D145.mat[,col.order] #new code as of Feb 22, 2021 (S.A.T.) colnames(D145.mat) <- c("pH 7.9 -> pH 7.9", "pH 7.9 -> pH 7.4","pH 7.4 -> pH 7.9","pH 7.4 -> pH 7.4", "pH 7.0 -> pH 7.9", "pH 7.0 -> pH 7.4") pdf("RAnalysis/Output/D145.DMG_Rel_HEATMAP_v2.pdf") heatmap.2(D145.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() { breaks = pretty(parent.frame()$breaks) #breaks = breaks[c(1,length(breaks))] list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=FALSE, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,sepcolor="white",sepwidth=c(0.1,0.01),colsep=c(0,2,4,6), lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4)) dev.off() ``` ```{r} ### Magnitude of change in secondary exposure D145.mat.1 <- D145.mat[,c(1:2)] D145.mat.1 <- as.data.frame(D145.mat.1) D145.mat.1$gene <-rownames(D145.mat.1) D145.mat.1$diff <- D145.mat.1$`pH 7.9 -> pH 7.9` - D145.mat.1$`pH 7.9 -> pH 7.4` A <- round(mean(D145.mat.1$diff),2) std.error(D145.mat.1$diff) var(D145.mat.1$diff) D145.mat.2 <- D145.mat[,c(3:4)] D145.mat.2 <- as.data.frame(D145.mat.2) D145.mat.2$gene <-rownames(D145.mat.2) D145.mat.2$diff <- D145.mat.2$`pH 7.4 -> pH 7.9` - D145.mat.2$`pH 7.4 -> pH 7.4` M <-round(mean(D145.mat.2$diff),2) std.error(D145.mat.2$diff) var(D145.mat.2$diff) D145.mat.3 <- D145.mat[,c(5:6)] D145.mat.3 <- as.data.frame(D145.mat.3) D145.mat.3$gene <-rownames(D145.mat.3) D145.mat.3$diff <- D145.mat.3$`pH 7.0 -> pH 7.9` - D145.mat.3$`pH 7.0 -> pH 7.4` L <-round(mean(D145.mat.3$diff), 2) std.error(D145.mat.3$diff) var(D145.mat.3$diff) labs1 <- c(A, M, L) A L M ``` Plotting Percent methylation for Persistent DMGs ```{r} ##### Heatmap ##### #D10 Persistant.meth_table.means.D10 <- MD.ALL %>% filter(TimePoint=="Day10") %>% group_by(gene, Initial.Treatment) %>% summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth))) Persistant.meth_table.means.D10 <- Persistant.meth_table.means.D10 [Persistant.meth_table.means.D10 $gene %in% ints_all$gene,] Pers.D10.HM <- Persistant.meth_table.means.D10 %>% ggplot(., aes(x = Initial.Treatment, y = gene, fill =avg)) + geom_tile() + scale_fill_viridis(discrete=FALSE)+ #scale_fill_gradientn(colours = c("black", "green","cyan", "red", "orange", "yellow"), values = c(0,20, 40, 60, 80,100))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 5), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) #Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/Persist.D10.HM.pdf", Pers.D10.HM, width = 5, height = 30, units = c("in")) #D135 Persistant.meth_table.means.D135 <- MD.ALL %>% filter(TimePoint=="Day135") %>% group_by(gene, Initial.Treatment) %>% summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth))) Persistant.meth_table.means.D135 <- Persistant.meth_table.means.D135[Persistant.meth_table.means.D135$gene %in% ints_all$gene,] Pers.D135.HM <- Persistant.meth_table.means.D135 %>% ggplot(., aes(x = Initial.Treatment, y = gene, fill =avg)) + geom_tile() + scale_fill_viridis(discrete=FALSE)+ #scale_fill_gradientn(colours = c("black", "green","cyan", "red", "orange", "yellow"), values = c(0,20, 40, 60, 80,100))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 5), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) #Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/Persist.D135.HM.pdf", Pers.D135.HM, width = 5, height = 30, units = c("in")) #D145 Persistant.meth_table.means.D145 <- MD.ALL %>% filter(TimePoint=="Day145") %>% group_by(gene, Initial.Treatment, Secondary.Treatment) %>% summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth))) Persistant.meth_table.means.D145 <- Persistant.meth_table.means.D145[Persistant.meth_table.means.D145$gene %in% ints_all$gene,] Persistant.meth_table.means.D145$group <- paste0(Persistant.meth_table.means.D145$Initial.Treatment, "_", Persistant.meth_table.means.D145$Secondary.Treatment) Pers.D145.HM <- Persistant.meth_table.means.D145 %>% ggplot(., aes(x = group, y = gene, fill =avg)) + geom_tile() + scale_fill_viridis(discrete=FALSE)+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 5), strip.text.x = element_text(size = 20), axis.text = element_text(size = 8), axis.title.x = element_blank(), axis.title.y = element_blank()) Persist.DMGs <- ggarrange(Pers.D10.HM,Pers.D135.HM, Pers.D145.HM, ncol=3, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/Persist.alltimes.HM.pdf", Persist.DMGs, width = 15, height = 30, units = c("in")) ``` ```{r} #obtain genes from sig.GO.D10, sig.GO.D135, sig.GO.D145 #plot geom_tile heatmap as above with only those genes faceted by goslim term #add column to sig.GO dataframes to denote sampling time point sig.GO.D10$TP <- "Day10" sig.GO.D135$TP <- "Day135" sig.GO.D145$TP <- "Day145" #bind data frames together byGO <- as.data.frame(rbind(sig.GO.D10, sig.GO.D135, sig.GO.D145)) #create list objects where each list item is a vector containing genes, uniprot ids, or taxa ids for each line split_byGO_gene <- strsplit(as.character(byGO$gene), "; ") #split into multiple gene ids split_byGO_uniprot <- strsplit(as.character(byGO$uniprot), "; ") #split into multiple uniprot ids split_byGO_taxa <- strsplit(as.character(byGO$uniprot_taxa.id), "; ") #split into multiple taxa ids #create a new data frame that lists each gene, uniprot ID, and taxa id on a separate line (so the GO IDs are listed multiple times for each gene). #For the first data.frame Gene.byGO.IDs, I'm adding all the data from byGO and this will be maintained when it is merged with the uniprot info Gene.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_gene, length)), over_represented_pvalue = rep.int(byGO$over_represented_pvalue, sapply(split_byGO_gene, length)), under_represented_pvalue = rep.int(byGO$under_represented_pvalue, sapply(split_byGO_gene, length)) , numDEInCat = rep.int(byGO$numDEInCat, sapply(split_byGO_gene, length)), numInCat = rep.int(byGO$numInCat, sapply(split_byGO_gene, length)), term = rep.int(byGO$term, sapply(split_byGO_gene, length)), ontology = rep.int(byGO$ontology, sapply(split_byGO_gene, length)), Cat = rep.int(byGO$Cat, sapply(split_byGO_gene, length)),GO.Slim.Term = rep.int(byGO$GO.Slim.Term, sapply(split_byGO_gene, length)),TP = rep.int(byGO$TP, sapply(split_byGO_gene, length)), gene = unlist(split_byGO_gene)) #create a dataframe containing the GO.IDs and uniprot IDs all listed on separate lines unip.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_uniprot, length)), uniprot = unlist(split_byGO_uniprot)) #create a dataframe containing the GO.IDs and uniprot taxa IDs all listed on separate lines taxa.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_taxa, length)), uniprot_taxa.id = unlist(split_byGO_taxa)) #merge the dataframes with everything listed on separate lines byGO_long <- cbind(Gene.byGO.IDs, "uniprot" =unip.byGO.IDs[,"uniprot"],uniprot_taxa.id =taxa.byGO.IDs[,"uniprot_taxa.id"]) #remove the species from the taxa id byGO_long$uniprot_taxa.id <- gsub("_.*", "",byGO_long$uniprot_taxa.id) #re-order the df by slim, timepoint, then pvalue byGO_long <- byGO_long[order(byGO_long$GO.Slim.Term,byGO_long$TP,byGO_long$over_represented_pvalue),] # paste gene uniprot id next to terms for (i in 1:nrow(byGO_long)){ byGO_long$taxa.id[i] <- paste(unique(unname(byGO_long[grep(byGO_long$GO.IDs[i],byGO_long$GO.IDs),"uniprot_taxa.id"])), collapse = "; ") } byGO_long_term_gene <- unique(byGO_long[,-grep("^gene$|uniprot", colnames(byGO_long))]) byGO_long_term_gene$term_gene <- paste0(byGO_long_term_gene$term, " (",byGO_long_term_gene$taxa.id,")") ### add order column to order terms by p.value in heatmap byGO_long_term_gene <- byGO_long_term_gene %>% arrange(GO.Slim.Term, over_represented_pvalue) %>% mutate(order = row_number()) ###make the same order number for duplicate terms so that terms found across timepoints can be plotted in the same row #extract the order number for the term with the lowest p.value byGO_long_term_gene_rank <- byGO_long_term_gene %>% group_by(term_gene) %>% dplyr::slice(which.min(order)) #save only the term_gene and order columns so these can be merged with the main df byGO_long_term_gene_rank <- byGO_long_term_gene_rank[,c("term_gene", "order")] #rename the order column that will be merged colnames(byGO_long_term_gene_rank) <- c("term_gene", "order2") #merge the new order numbers with the main df byGO_long_term_gene <- merge(byGO_long_term_gene, byGO_long_term_gene_rank, by = "term_gene") #plot all BP categories FigS5<- byGO_long_term_gene %>% filter(ontology=="BP", TP!= "Persistent") %>% ggplot(., aes(x = TP, y = order2)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 12), strip.text.x = element_text(size = 20), axis.text = element_text(size = 4), axis.title.x = element_blank(), axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2), labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) #Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom") ggsave(file="RAnalysis/Figs/FigS5_BP_all.pdf", FigS5, width = 10, height = 16, units = c("in")) #subset GO slims to reduce heatmap size and focus on those that are not in NA slim or greater than 1-2 terms Fig7<- byGO_long_term_gene %>% filter(ontology=="BP", TP!= "Persistent",GO.Slim.Term %in% c("cell organization and biogenesis","signal transduction", "transport", "developmental processes", "protein metabolism")) %>% ggplot(., aes(x = TP, y = order2)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(factor(GO.Slim.Term, levels = c("cell organization and biogenesis","signal transduction", "developmental processes", "protein metabolism", "transport")) ~ .,scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 12), strip.text.x = element_text(size = 20), axis.text = element_text(size = 4), axis.title.x = element_blank(), axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2), labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) ggsave(file="RAnalysis/Figs/Fig7_BP_subset.pdf", Fig7, width = 8, height = 10, units = c("in")) FigS6<- byGO_long_term_gene %>% filter(ontology=="MF",TP!= "Persistent") %>% ggplot(., aes(x = TP, y = order2)) + geom_tile(aes(fill =over_represented_pvalue)) + facet_grid(GO.Slim.Term ~ .,scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), strip.text.y = element_text(angle=0, size = 12), strip.text.x = element_text(size = 20), axis.text = element_text(size = 4), axis.title.x = element_blank(), axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2), labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) ggsave(file="RAnalysis/Figs/FigS6_MF_all.pdf", FigS6, width = 8, height = 20, units = c("in")) ```