---
title: "Geoduck DNA Methylation"
author: "HPutnam"
date: "05/14/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```

## Load Libraries

```{r message = FALSE, warning = FALSE}
library(plotrix) 
library(ggplot2)
library(gridExtra)
library(seacarb) 
library(pheatmap)
library(dplyr)
library(tidyverse)
library(tidyr)
library(goseq)
library(genefilter)
library(gplots)
library(cowplot)
library(lsmeans)
library(data.table)
library(RColorBrewer)
library(randomcoloR)
#library(GSEABase)
library(ggpubr)
library(hrbrthemes)
library(viridis)

```

Flow Rates
```{r Flow Rates}
# FLOW RATES
#Initial Exposure: Quantifying flow and testing for differences among treatments
Flow1 <- read.csv("RAnalysis/Data/Flow1_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
#flow values measured in ml/10 sec
Flow1$Rate <- (((Flow1$Rate1 + Flow1$Rate2)/2)*6)/1000*60 #calculate flow rate in liters per hour
Flow1.Rate <- aggregate(Rate ~ Treatment, data=Flow1, FUN=mean) #mean flow by treatment
Flow1.Rate$se <- aggregate(Rate ~ Treatment, data=Flow1, FUN=std.error) #sem flow by treatment
mean(Flow1$Rate) #mean flow across all tanks
std.error(Flow1$Rate) #sem flow across all tanks
length(Flow1$Rate) #n flow across all tanks


FL1 <- aov(Rate ~ Treatment, data=Flow1) #test for differences in flow rate between treatments
FL1.res <- anova(FL1) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(FL1$residuals) #plot histogram of residuals
boxplot(FL1$residuals) #plot boxplot of residuals
plot(FL1) #display residuals versus fitter, normal QQ plot, leverage plot


#Secondary Exposure: Quantifying flow and testing for differences among treatments
Flow2 <- read.csv("RAnalysis/Data/Flow2_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
#flow values measured in ml/15 sec
Flow2$Rate <- ((Flow2$Rate1)*4)/1000*60 #calculate flow rate in liters per hour
Flow2.Rate <- aggregate(Rate ~ Treatment, data=Flow2, FUN=mean) #mean flow by treatment
Flow2.Rate$se <- aggregate(Rate ~ Treatment, data=Flow2, FUN=std.error) #sem flow by treatment
mean(Flow2$Rate) #mean flow across all tanks
std.error(Flow2$Rate) #sem flow across all tanks
length(Flow2$Rate) #n flow across all tanks

FL2 <- aov(Rate ~ Treatment, data=Flow2) #test for differences in flow rate between treatments
FL2.res <-anova(FL2) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(FL2$residuals) #plot histogram of residuals
boxplot(FL2$residuals) #plot boxplot of residuals
plot(FL2) #display residuals versus fitter, normal QQ plot, leverage plot
```

Feeding Rates
```{r Feeding Rates}
# FEEDING RATES
#Initial Exposure: Quantifying feeding and testing for differences among treatments
cells.1 <- read.csv("RAnalysis/Data/Cell_Counts_Juvenile_Geoduck_Exp1.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
cells.1$Avg.Cells <- rowMeans(cells.1[,c("Count1",  "Count2")], na.rm = TRUE) #calculate average of counts
cells.1$cell.num <- cells.1$Avg.Cells/cells.1$Volume.Counted #calculate density

avg.cells.tank.1 <- do.call(data.frame,aggregate(cell.num ~ Tank, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Tank
avg.cells.trt.1 <- do.call(data.frame,aggregate(cell.num ~ Treatment, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment
avg.cells.1 <- do.call(data.frame,aggregate(cell.num ~ Volume.Counted, data = cells.1, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment
colnames(avg.cells.tank.1) <- c("Tank", "mean", "se") #rename columns 
colnames(avg.cells.trt.1) <- c("Treatment", "mean", "se") #rename columns 

cells1.tank <- aov(cell.num ~Tank, data=cells.1) #test the hypothesis feeding does not differ between tanks
cells1.tank.res <-anova(cells1.tank) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(cells1.tank$residuals) #plot histogram of residuals
boxplot(cells1.tank$residuals) #plot boxplot of residuals
plot(cells1.tank) #display residuals versus fitter, normal QQ plot, leverage plot

cells1.trt <- aov(cell.num ~Treatment, data=cells.1) #test the hypothesis feeding does not differ between treatments
cells1.trt.res <-anova(cells1.trt) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(cells1.trt$residuals) #plot histogram of residuals
boxplot(cells1.trt$residuals) #plot boxplot of residuals
plot(cells1.trt) #display residuals versus fitter, normal QQ plot, leverage plot

#Secondary Exposure: Quantifying feeding and testing for differences among treatments
cells.2 <- read.csv("RAnalysis/Data/Cell_Counts_Juvenile_Geoduck_Exp2.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
cells.2$cell.num <- rowMeans(cells.2[,c("Count1",  "Count2", "Count3")], na.rm = TRUE) #calculate average of counts
avg.cells.tank.2 <- do.call(data.frame,aggregate(cell.num ~ Tank, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Tank
avg.cells.trt.2 <- do.call(data.frame,aggregate(cell.num ~ Treatment, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment
avg.cells.2 <- do.call(data.frame,aggregate(cell.num ~ Notes, data = cells.2, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each Treatment
colnames(avg.cells.tank.2) <- c("Tank", "mean", "se") #rename columns 
colnames(avg.cells.trt.2) <- c("Treatment", "mean", "se") #rename columns 

cells2.tank <- aov(cell.num ~Tank, data=cells.2) #test the hypothesis feeding does not differ between tanks
cells2.tank.res <-anova(cells2.tank) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(cells2.tank$residuals) #plot histogram of residuals
boxplot(cells2.tank$residuals) #plot boxplot of residuals
plot(cells2.tank) #display residuals versus fitter, normal QQ plot, leverage plot

cells2.trt <- aov(cell.num ~Treatment, data=cells.2) #test the hypothesis feeding does not differ between treatments
cells2.trt.res <-anova(cells2.trt) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(cells2.trt$residuals) #plot histogram of residuals
boxplot(cells2.trt$residuals) #plot boxplot of residuals
plot(cells2.trt) #display residuals versus fitter, normal QQ plot, leverage plot
```

Continuous pH data 
```{r Continuous pH}
#CONTINUOUS EXPERIMENTAL PH DATA 
#DuraFET_pH_data.csv
pH <- read.csv("RAnalysis/Data/DuraFET_pH_data.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
pH$Date.Time <-as.POSIXct(pH$Date.Time, format="%m/%d/%y %H:%M") #convert date format
pH$Date <- as.Date(pH$Date.Time) #convert Date only
pH$Time <- format(as.POSIXct(pH$Date.Time) ,format = "%H:%M:%S") #convert Time only

pH.low <- do.call(data.frame,aggregate(Low ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
pH.med <- do.call(data.frame,aggregate(Medium ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
pH.amb <- do.call(data.frame,aggregate(Ambient ~ Date, data = pH, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
pH.low$Treatment <- "pH 7.0" #Add treatment Information
colnames(pH.low) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
pH.med$Treatment <- "pH 7.4" #Add treatment Information
colnames(pH.med) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
pH.amb$Treatment <- "pH 7.9" #Add treatment Information
colnames(pH.amb) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
daily.pH <- rbind(pH.amb, pH.med, pH.low) #bind treatment data 
daily.pH #view data

# Plot daily averages of pH data for the complete experiment
All.pH <- ggplot(daily.pH, aes(x=Date, y=mean, group=Treatment)) + #set up plot information
  geom_errorbar(aes(ymin=daily.pH$mean-daily.pH$se, ymax=daily.pH$mean+daily.pH$se), colour="black", width=.1, position = position_dodge(width = 0.05)) + #add standard error bars about the mean
  geom_point(aes(shape=Treatment), size = 2, position = position_dodge(width = 0.05)) + #include points in the shape of the treatments
  annotate("text", x=as.Date("2016-03-25"), y=7.95, label = "No Data") + #add text to the graphic where data are missing
  annotate("text", x=as.Date("2016-06-15"), y=7.95, label = "No Data") + #add text to the graphic where data are missing
  xlab("Time") + #label x axis
  ylab("pH Total Scale") + # label y axis
  ylim(6.8,8.1) + # set y axis scale
  geom_vline(xintercept = 16899, linetype="dotted", color = "gray", size=1) + #add vertical line
  geom_vline(xintercept = 16928, linetype="dashed", color = "gray", size=1) + #add vertical line
  geom_vline(xintercept = 17013, linetype="dotted", color = "gray", size=1) + #add vertical line
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(), #Set the plot background
        legend.position=c(.65, .25), #set legend location
        legend.text = element_text(size = 8), #set the legend text size
        legend.key = element_blank(), #remove the legend background
        legend.title = element_text(size=8, face="bold")) + #set legend title attributes
  ggtitle("A) Experimental pH") + #add a main title
  theme(plot.title = element_text(face = 'bold', 
                                  size = 12, 
                                  hjust = 0)) #set title attributes
All.pH #view plot
```

Continuous Temperature data - Avtech
```{r Continuous Temperature}
#CONTINUOUS EXPERIMENTAL TEMPERATURE DATA
#Avtech_temp_data.csv
temp <- read.csv("RAnalysis/Data/Avtech_temp_data.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
temp$Date.Time <-as.POSIXct(temp$Date.Time, format="%m/%d/%y %H:%M") #convert date format
temp$Date <- as.Date(temp$Date.Time) #convert Date only
temp$Time <- format(as.POSIXct(temp$Date.Time) ,format = "%H:%M:%S") #convert time only

temp.low <- do.call(data.frame,aggregate(Low ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
temp.med <- do.call(data.frame,aggregate(Medium ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
temp.amb <- do.call(data.frame,aggregate(Ambient ~ Date, data = temp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem of each treatment by Day
temp.low$Treatment <- "Super Low" #Add treatment Information
colnames(temp.low) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
temp.med$Treatment <- "Low" #Add treatment Information
colnames(temp.med) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
temp.amb$Treatment <- "Ambient" #Add treatment Information
colnames(temp.amb) <- c("Date", "mean", "se", "Treatment") #rename columns to generic format
daily.temp <- rbind(temp.amb, temp.med, temp.low) #bind treatment data 
daily.temp #view data

All.temp <- ggplot(daily.temp, aes(x=Date, y=mean, group=Treatment)) + #set up plot information
  geom_errorbar(aes(ymin=daily.temp$mean-daily.temp$se, ymax=daily.temp$mean+daily.temp$se), colour="black", width=.1, position = position_dodge(width = 0.05)) + #add standard error bars about the mean
  geom_point(aes(shape=Treatment), size = 2, position = position_dodge(width = 0.05)) + #include points in the shape of the treatments
  annotate("text", x=as.Date("2016-05-25"), y=14, label = "No Data") + #add text to the graphic where data are missing
  xlab("Time") + #label x axis
  ylab("Temperature °C") + #label y axis
  ylim(0,20) + # set y axis scale
  geom_vline(xintercept = 16899, linetype="dotted", color = "gray", size=1) + #add vertical line
  geom_vline(xintercept = 16928, linetype="dashed", color = "gray", size=1) + #add vertical line
  geom_vline(xintercept = 17013, linetype="dotted", color = "gray", size=1) + #add vertical line
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(), #Set the plot background
        legend.position = "none") + #remove legend
  ggtitle("B) Experimental Temperature") + #add a main title
  theme(plot.title = element_text(face = 'bold', 
                                  size = 12, 
                                  hjust = 0)) #set title attributes
All.temp #view plot

FigureS1 <- arrangeGrob(All.pH, All.temp, ncol=1)
ggsave(file="RAnalysis/Figs/Supplementary_Figs/Fig.S5.pdf", FigureS1, width = 6, height = 8, units = c("in"))
```

Discrete Seawater Chemistry
```{r Discrete Seawater Chemistry}
#SEAWATER CHEMISTRY ANALYSIS FOR DISCRETE MEASUREMENTS

#pH Tris Calibration Curves
#Data to calculate conversion equations from mV to total scale using tris standard for pH probe
path <-("RAnalysis/Data/pH_Calibration_Files/") #set path to calibration file folder
file.names<-list.files(path = path, pattern = "csv$") #list all the file names with csv 
pH.cals <- data.frame(matrix(NA, nrow=length(file.names), ncol=4, dimnames=list(file.names,c("Date", "Intercept", "Slope","R2")))) #generate an empty 3 column dataframe with specific column names

for(i in 1:length(file.names)) { # for every file in list start at the first and run this following function
  Calib.Data <-read.table(file.path(path,file.names[i]), header=TRUE, sep=",", na.string="NA", as.is=TRUE) #reads in the data files
  model <-lm(mVTris ~ TTris, data=Calib.Data) #runs a linear regression of mV as a function of temperature
  coe <- coef(model) #extracts the coeffecients
  R <- summary(model)$r.squared #extracts the R2
  pH.cals[i,2:3] <- coe #inserts coef in the dataframe
  pH.cals[i,4] <- R #inserts R2 in the dataframe
  pH.cals[i,1] <- substr(file.names[i],1,8) #stores the file name in the Date column
}

colnames(pH.cals) <- c("Calib.Date",  "Intercept",  "Slope", "R2") #names the columns of the dataframe
pH.cals #view data

#Calculate Total pH
SW.chem <- read.csv("RAnalysis/Data/SW_Chem_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
SW.chem <- merge(pH.cals, SW.chem, by="Calib.Date") #merge pH calibrations with Seawater chemistry data
R <- 8.31447215 #gas constant in J mol-1 K-1 
F <-96485.339924 #Faraday constant in coulombs mol-1
mvTris <- SW.chem$Temperature*SW.chem$Slope+SW.chem$Intercept #calculate the mV of the tris standard using the temperature mv relationships in the measured standard curves 
STris<-27.5 #salinity of the Tris from local batch made for Manchester WA salinity conditions in Spring 2016. Batch date = 20160214
phTris<- (11911.08-18.2499*STris-0.039336*STris^2)*(1/(SW.chem$Temperature+273.15))-366.27059+ 0.53993607*STris+0.00016329*STris^2+(64.52243-0.084041*STris)*log(SW.chem$Temperature+273.15)-0.11149858*(SW.chem$Temperature+273.15) #calculate the pH of the tris (Dickson A. G., Sabine C. L. and Christian J. R., SOP 6a)
SW.chem$pH.Total<-phTris+(mvTris/1000-SW.chem$pH.MV/1000)/(R*(SW.chem$Temperature+273.15)*log(10)/F) #calculate the pH on the total scale (Dickson A. G., Sabine C. L. and Christian J. R., SOP 6a)

#calculate mean TA from titrations
Trts.TA <- SW.chem %>%
  group_by(Treatment.Group, Exposure) %>%
  summarise(mean = mean(Total.Alkalinity,na.rm = TRUE),sem=std.error(Total.Alkalinity,na.rm = TRUE), N= sum(!is.na(Total.Alkalinity)))

TA.mod <- aov(Total.Alkalinity ~ Treatment, data=SW.chem)
summary(TA.mod)
par(mfrow=c(1,3))
hist(TA.mod$residuals)
boxplot(TA.mod$residuals)
plot(TA.mod$fitted.values,TA.mod$residuals)

#apply mean TA value per treatment to new column for seacarb calculations
# SW.chem %>%
#   mutate(TA.mean = case_when(Treatment.Group %in% Ambient ~ Trts.TA[1,3], 
#                              Treatment.Group %in% Mid ~ Trts.TA[4,3], 
#                              Treatment.Group %in% High ~ Trts.TA[3,3])) 

#Calculate CO2 parameters using seacarb
carb.output <- carb(flag=8, var1=SW.chem$pH.Total, var2=SW.chem$TA.calc/1000000, S= SW.chem$Salinity, T=SW.chem$Temperature, P=0, Pt=0, Sit=0, pHscale="T", kf="pf", k1k2="l", ks="d") #calculate seawater chemistry parameters using seacarb
carb.output$ALK <- carb.output$ALK*1000000 #convert to µmol kg-1
carb.output$CO2 <- carb.output$CO2*1000000 #convert to µmol kg-1
carb.output$HCO3 <- carb.output$HCO3*1000000 #convert to µmol kg-1
carb.output$CO3 <- carb.output$CO3*1000000 #convert to µmol kg-1
carb.output$DIC <- carb.output$DIC*1000000 #convert to µmol kg-1
carb.output <- cbind(SW.chem$Measure.Date,  SW.chem$Exposure, SW.chem$Sample.ID,  SW.chem$Treatment, carb.output) #combine the sample information with the seacarb output
colnames(carb.output) <- c("Date", "Exposure", "Tank",  "Treatment",	"flag",	"Salinity",	"Temperature","Patm",	"Pressure",	"pH",	"CO2",	"fCO2", "pCO2",	"fCO2pot","pCO2pot","fCOinsitu", "pCOinsitu","HCO3",	"CO3",	"DIC", "TA",	"Aragonite.Sat", 	"Calcite.Sat") #Rename columns to describe contents
carb.output <- subset(carb.output, select= c("Exposure","Date",  "Tank",  "Treatment",	"Salinity",	"Temperature",		"pH",	"CO2",	"pCO2",	"HCO3",	"CO3",	"DIC", "TA",	"Aragonite.Sat"))

#Calculate descriptive stats for seawater chemistry by Exposure
mean.carb.output  <- carb.output %>% 
  group_by(Exposure, Treatment) %>%
   summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"),
            mean) 
sem.carb.output  <- carb.output %>% 
  group_by(Exposure, Treatment) %>%
   summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"),
            std.error) 
N.carb.output  <- carb.output %>% 
  group_by(Exposure, Treatment) %>%
   summarise_at(c("Salinity","Temperature","pH","CO2","pCO2","HCO3","CO3","DIC","TA","Aragonite.Sat"),
            length) 
 
```

Shell Size
```{r Shell Size}
##### Shell Size
##### JUVENILE GEODUCK SHELL SIZE #####

seed.size <- read.csv("RAnalysis/Data/Size_Juvenile_Geoduck.csv", header=TRUE, sep=",", na.strings="NA") #load data with a header, separated by commas, with NA as NA
seed.size$Treatment <- factor(seed.size$Treatment, levels = c("pH 7.9", "pH 7.4", "pH 7.0"))
seed.size$Secondary <- factor(seed.size$Secondary, levels = c("pH 7.9", "pH 7.4", "pH 7.0"))

#Initial Exposure Experimental Component
Initial <- subset(seed.size, Component=="Exp1", select = Date:Area) #subset dataframe to exposure 1 data
Initial$Ratio <- Initial$Length/Initial$Width #calculate the length:width of the shell

#calculating shell area means for Exposure 1 normalizations
E1.norms <- aggregate(Area ~ Day*Treatment, data=Initial, mean) #calculate mean area by Day and Treatment
E1.norm.area.amb <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.9", select = Area) #subset mean area for day 1 ambient condition
E1.norm.area.med <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.4", select = Area) #subset mean area for day 1 medium condition
E1.norm.area.low <- subset(E1.norms, Day=="Day1" & Treatment=="pH 7.0", select = Area) #subset mean area for day 1 low condition

E1.norms <- function(x) {  #write function
  if(x == "pH 7.9") y <- E1.norm.area.amb #if Treatment equals Ambient assign day 1 Ambient mean as normalization factor
  if(x == "pH 7.4") y <- E1.norm.area.med #if Treatment equals Medium assign day 1 Medium mean as normalization factor
  if(x == "pH 7.0") y <- E1.norm.area.low #if Treatment equals Low assign day 1 Low mean as normalization factor
  return(y) #return result
}

Initial$A.norm <- as.numeric(sapply(Initial$Treatment,E1.norms)) #add a column with the normalization values 
Initial$A.rel <- Initial$Area/Initial$A.norm #normalize the area to be relative size
Init.area <- do.call(data.frame,aggregate(Area ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
Init.Len <- do.call(data.frame,aggregate(Length ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
Init.Wid <- do.call(data.frame,aggregate(Width ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
Init.Ratio <- do.call(data.frame,aggregate(Ratio ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
Init.ARel <- do.call(data.frame,aggregate(A.rel ~ Day*Treatment, data = Initial, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
Init.Shell.size <- cbind(Init.area, Init.Len$Length.mean, Init.Len$Length.se, Init.Wid$Width.mean, Init.Wid$Width.se, Init.Ratio$Ratio.mean, Init.Ratio$Ratio.se, Init.ARel$A.rel.mean, Init.ARel$A.rel.se) #bind columns into dataframe
colnames(Init.Shell.size) <- c("Day", "Treatment", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns

# Significance testing for initial exposure
Init <- aov(log10(A.rel) ~ Treatment*Day, data=Initial) #test the hypothesis relative size does not differ between treatments and time
Init.res <-anova(Init) #display anova results
par(mfrow=c(3,2)) #set plotting configuration
par(mar=c(1,1,1,1)) #set margins for plots
hist(Init$residuals) #plot histogram of residuals
boxplot(Init$residuals) #plot boxplot of residuals
plot(Init) #display residuals versus fitted, normal QQ plot, leverage plot

#PostHoc Tukey adjustment comparison of least-squares means
E1.lsm <- lsmeans(Init, ~ Treatment*Day, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model
E1.pairs <- multcomp::cld(E1.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display
E1.pairs #view results

# calculate percent difference in treatment relative to ambient
((Init.Shell.size$avg.Arel[2]-Init.Shell.size$avg.Arel[5])/Init.Shell.size$avg.Arel[2])*100
((Init.Shell.size$avg.Arel[3]-Init.Shell.size$avg.Arel[6])/Init.Shell.size$avg.Arel[3])*100

#Exposure 1 Plotting
Init.Shell.size$Day.Num <- c(1,10,23,1,10,23,1,10,23) #set days as numeric for plotting


Fig.Exp1.size <- ggplot(Init.Shell.size, aes(x=Day.Num, y=avg.Arel, group=Treatment)) + 
  geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.6)) +
  geom_line(aes(linetype=Treatment, colour = Treatment), size = 0.5, position = position_dodge(width = 0.6)) + 
  scale_color_manual(values=c("blue", "mediumvioletred","red")) +
  geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.6)) +
  scale_fill_manual(values=c("blue","mediumvioletred","red")) +
  scale_x_continuous(breaks=seq(0,160,10)) +
  annotate("text", x=9, y=0.73, label = "a", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=1, y=1.2, label = "abc", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=11, y=1, label = "ab", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=c(9,22.5), y=c(1.2,0.95), label = "b", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=22.5, y=1.25, label = "bc", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=22, y=1.6, label = "c", size = 3) + #add text to the graphic for posthoc letters
  xlab("Days") +
  ylab(expression(paste("Relative Size"))) +
  ylim(0.5,3.5) +
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(),  #Set the plot background
        legend.key = element_blank(),  #remove legend background
        legend.position=c(.5, .7)) + #set legend location
  ggtitle("A) Initial Exposure") +
  theme(plot.title = element_text(face = 'bold', 
                                  size = 12, 
                                  hjust = 0))
Fig.Exp1.size

#Common Garden Experimental Component
CG.data <- subset(seed.size, Component=="Common.Garden", select = Date:Area) #subset dataframe to common garden data
CG.data$Ratio <- CG.data$Length/CG.data$Width #calculate the length:width of the shell

#calculating shell area means for Common Garden normalizations
CG.norm.area.amb <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.9", select = avg.area) #subset mean area for day 23 ambient condition
CG.norm.area.med <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.4", select = avg.area) #subset mean area for day 23 medium condition
CG.norm.area.low <- subset(Init.Shell.size, Day=="Day23" & Treatment=="pH 7.0", select = avg.area) #subset mean area for day 23 high condition

CG.norms <- function(x) { 
  if(x == "pH 7.9") y <- CG.norm.area.amb #if Treatment equals Ambient assign day 23 Ambient mean as normalization factor
  if(x == "pH 7.4") y <- CG.norm.area.med #if Treatment equals Medium assign day 23 Medium mean as normalization factor
  if(x == "pH 7.0") y <- CG.norm.area.low #if Treatment equals Low assign day 23 Low mean as normalization factor
  return(y)
}

CG.data$A.norm <- as.numeric(sapply(CG.data$Treatment,CG.norms)) #add a column with the normalization values
CG.data$A.rel <- CG.data$Area/CG.data$A.norm #normalize the area to be relative size
CG.area <- do.call(data.frame,aggregate(Area ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
CG.Len <- do.call(data.frame,aggregate(Length ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
CG.Wid <- do.call(data.frame,aggregate(Width ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
CG.Ratio <- do.call(data.frame,aggregate(Ratio ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
CG.ARel <- do.call(data.frame,aggregate(A.rel ~ Day*Treatment, data = CG.data, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
CG.Shell.size <- cbind(CG.area, CG.Len$Length.mean, CG.Len$Length.se, CG.Wid$Width.mean, CG.Wid$Width.se, CG.Ratio$Ratio.mean, CG.Ratio$Ratio.se, CG.ARel$A.rel.mean, CG.ARel$A.rel.se) #bind columns into dataframe
colnames(CG.Shell.size) <- c("Day", "Treatment", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns

CG.Shell.size <- CG.Shell.size %>%
  filter(Day=="Day135")

CG.ARel <- CG.ARel[c(1,3,5),]

CG.data.135 <- CG.data %>%
  filter(Day=="Day135")

# Significance testing for common garden
CG <- aov(log10(A.rel) ~ Treatment, data=CG.data.135)
CG.res <- anova(CG)
CG.res
par(mfrow=c(3,2))
par(mar=c(1,1,1,1))
hist(CG$residuals)
boxplot(CG$residuals)
plot(CG)

#PostHoc Tukey adjustment comparison of least-squares means
CG.lsm <- lsmeans(CG, ~ Treatment, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model
CG.pairs <- multcomp::cld(CG.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display
CG.pairs #view results

# calculate fold difference in treatment relative to ambient for day 135
CG.Shell.size$avg.Arel[3]/CG.Shell.size$avg.Arel[1] 
CG.Shell.size$avg.Arel[2]/CG.Shell.size$avg.Arel[1] 

#Common Garden Plotting
CG.Shell.size$Day.Num <- c(135,135,135) #set days as numeric for plotting

Fig.CG.size <- ggplot(CG.Shell.size, aes(x=Day, y=avg.Arel, group=Treatment)) + 
  geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.6)) +
  geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.6)) +
  scale_color_manual(values=c("blue", "mediumvioletred","red")) +
  annotate("text", x=0.8, y=2.55, label = "a", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=1, y=3.48, label = "b", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=1.2, y=3.3, label = "b", size = 3) + #add text to the graphic for posthoc letters
  xlab("Treatment") +
  ylab(expression(paste("Relative Size"))) +
  ylim(0.5,3.5) +
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(),  #Set the plot background
        legend.position='none') + #remove legend background
  ggtitle("B) Common Garden") +
  theme(plot.title = element_text(face = 'bold', 
                                  size = 12, 
                                  hjust = 0))
Fig.CG.size

#Exposure 2 Experimental Component
ReExp <- subset(seed.size, Day=="Day145" | Day=="Day158", select = Date:Area) #subset dataframe to exposure 2 data
ReExp$Ratio <- ReExp$Length/ReExp$Width #calculate the length:width of the shell

#calculating mean for Exposure 2 normalizations
E2.norm.area.amb <- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.9", select = avg.area) #subset mean area for day 135 ambient condition
E2.norm.area.med <- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.4", select = avg.area) #subset mean area for day 135 medium condition
E2.norm.area.low<- subset(CG.Shell.size, Day=="Day135" & Treatment=="pH 7.0", select = avg.area) #subset mean area for day 135 low condition

E2.norms <- function(x) { 
  if(x == "pH 7.9") y <- E2.norm.area.amb #if Treatment equals Ambient assign day 135 Ambient mean as normalization factor
  if(x == "pH 7.4") y <- E2.norm.area.med #if Treatment equals Ambient assign day 135 Medium mean as normalization factor
  if(x == "pH 7.0") y <- E2.norm.area.low #if Treatment equals Ambient assign day 135 Low mean as normalization factor
  return(y)
}

ReExp$A.norm <- as.numeric(sapply(ReExp$Treatment,E2.norms)) #add a column with the normalization values
ReExp$A.rel <- ReExp$Area/ReExp$A.norm #normalize the area to be relative size
ReExp.area <- do.call(data.frame,aggregate(Area ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
ReExp.Len <- do.call(data.frame,aggregate(Length ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
ReExp.Wid <- do.call(data.frame,aggregate(Width ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
ReExp.Ratio <- do.call(data.frame,aggregate(Ratio ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
ReExp.ARel <- do.call(data.frame,aggregate(A.rel ~ Treatment*Secondary*Day, data = ReExp, function(x) c(mean = mean(x), se = std.error(x)))) #calculate mean and sem
ReExp.Shell.size <- cbind(ReExp.area, ReExp.Len$Length.mean, ReExp.Len$Length.se, ReExp.Wid$Width.mean, ReExp.Wid$Width.se, ReExp.Ratio$Ratio.mean, ReExp.Ratio$Ratio.se, ReExp.ARel$A.rel.mean, ReExp.ARel$A.rel.se) #bind columns into dataframe
colnames(ReExp.Shell.size) <- c("Treatment", "Secondary", "Day", "avg.area", "se.area","avg.len", "se.len", "avg.wid", "se.wid","avg.ratio", "se.ratio","avg.Arel", "se.Arel") #rename columns

# Significance testing for secondary exposure
Rexps <- aov(log10(A.rel) ~ Day*Treatment*Secondary, data=ReExp)
Rexp.res <- anova(Rexps)
Rexp.res
par(mfrow=c(3,2))
par(mar=c(1,1,1,1))
hist(Rexps$residuals)
boxplot(Rexps$residuals)
plot(Rexps)


Day145 <- subset(ReExp.Shell.size, Day=="Day145", select = Treatment:se.Arel)
Day158 <- subset(ReExp.Shell.size, Day=="Day158", select = Treatment:se.Arel)

#PostHoc
E2.lsm <- lsmeans(Rexps, ~ Day*Treatment*Secondary, adjust="tukey") #compute least-squares means for Treatment*Day from ANOVA model
E2.pairs <- multcomp::cld(E2.lsm, alpha=.05, Letters=letters) #list pairwise tests and letter display
E2.pairs #view results

#ReExp.Shell.size$group <- paste0(ReExp.Shell.size$Treatment, ReExp.Shell.size$Se)

#Exposure2 Plotting
Fig.Exp2.D10.size <- ggplot(Day145, aes(x=Secondary, y=avg.Arel, group=Treatment)) + 
  geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.05)) +
  geom_line(aes(linetype=Treatment, colour=Treatment), size = 0.5, position = position_dodge(width = 0.05)) +
  scale_color_manual(values=c("blue", "mediumvioletred","red")) +
  geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.05)) +
  scale_fill_manual(values=c("blue", "mediumvioletred","red")) +
  annotate("text", x=0.85, y=1.3, label = "b", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=0.85, y=0.88, label = "a", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=c(0.85,2.2,2.2,2.25), y=c(0.94,0.94, 0.88,0.97), label = "ab", size = 3) + #add text to the graphic for posthoc letters
  xlab("Secondary Treatment") +
  ylab("Relative Size") +
  ylim(0.5,1.5) +
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(), #Set the plot background
        legend.position='none') + #remove legend background
  ggtitle("C) Day 145") +
  theme(plot.title = element_text(face = 'bold', 
                                  size = 12, 
                                  hjust = 0))

Fig.Exp2.D10.size

#Exposure2 Plotting
Fig.Exp2.D23.size <- ggplot(Day158, aes(x=Secondary, y=avg.Arel, group=Treatment)) +
  geom_errorbar(aes(ymin=avg.Arel-se.Arel, ymax=avg.Arel+se.Arel), colour="black", width=.1, position = position_dodge(width = 0.05)) +
  geom_line(aes(linetype=Treatment, colour=Treatment), size = 0.5, position = position_dodge(width = 0.05)) +
  scale_color_manual(values=c("blue", "mediumvioletred","red")) +
  geom_point(aes(shape=Treatment, colour=Treatment), size = 3, position = position_dodge(width = 0.05)) +
  scale_fill_manual(values=c("blue","mediumvioletred","red")) +
  annotate("text", x=2.2, y=0.78, label = "a", size = 3) + #add text to the graphic for posthoc letters
  annotate("text", x=c(0.85,0.85,0.85,2.2,2.2), y=c(0.95,1.04,1.15, 0.88,0.97), label = "ab", size = 3) + #add text to the graphic for posthoc letters
  xlab("Secondary Treatment") +
  ylab("Relative Size") +
  ylim(0.5,1.5) +
  theme_bw() + #Set the background color
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=0.5), #Set the text angle
        axis.line = element_line(color = 'black'), #Set the axes color
        panel.border = element_blank(), #Set the border
        panel.grid.major = element_blank(), #Set the major gridlines
        panel.grid.minor = element_blank(), #Set the minor gridlines
        plot.background=element_blank(), #Set the plot background
        legend.position='none') + #remove legend background
  ggtitle("D) Day 158") +
  theme(plot.title = element_text(face = 'bold',
                                  size = 12,
                                  hjust = 0))

Fig.Exp2.D23.size

##### CAPTURE STATISTICAL OUTPUT AND FIGURES TO FILE #####
#Capture Figures to File
Figure1.Size <- arrangeGrob(Fig.Exp1.size,Fig.CG.size, Fig.Exp2.D10.size, Fig.Exp2.D23.size, ncol=4)
ggsave(file="RAnalysis/Figs/Fig3_Geoduck_Size.pdf", Figure1.Size, width = 10, height = 4, units = c("in"))
ggsave(file="RAnalysis/Figs/Fig3_Geoduck_Size.png", Figure1.Size, width = 10, height = 4, units = c("in"))

#Table S2 Phenotype ANOVA Results
# Title
cat("Table_S2_ANOVA_ShellArea\n\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt")
# add 2 newlines
cat("A) ANOVA results of shell area from initial exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
capture.output(Init.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
cat("\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
cat("B) ANOVA results of shell area from common garden exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
# add 2 newlines
capture.output(CG.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
cat("\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
cat("C) ANOVA results of shell area from secondary exposure\n", file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)
# add 2 newlines
capture.output(Rexp.res, file = "RAnalysis/Tables/Supplementary_Tables/Table_S2_ANOVA_ShellArea.txt", append = TRUE)


```

Methylation Analysis
## Loading genomic and annotation
```{r}
#load sample information
sample.info <- read.csv("RAnalysis/Data/Sample.Info.csv", header=T, sep=",", na.string="NA", stringsAsFactors = F) #read in info
samp <- sample.info$Sample.ID # set sample info
samp <- gsub("[_]", "-", samp) #remove extra characters

#load genes gff (see OSF repository: https://osf.io/bcxk7/)
Genes <- read.csv("RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3", header=F, sep="\t", na.string="NA", skip=3, stringsAsFactors = F) #read in data file
Genes <- Genes[,c(1,4,5,9)] #select desired columns only
colnames(Genes) <- c("scaffold", "start", "stop", "gene") #rename columns
Genes$gene <- gsub(";.*","",Genes$gene) #remove extra characters
Genes$gene <- gsub("ID=","",Genes$gene) #remove extra characters

#Load annotation file (see OSF repository)
Annot <- read.csv("RAnalysis/Data/Genome/Panopea-generosa-genes-annotations.tab", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE, skip=0)
colnames(Annot) <- c("gene","scaffold", "start", "end", "uniprot.id", "uniprot_taxa.id", "Uniprot.Name", "GO.IDs", "GO.Descript")
GO.data <- Annot[,c(1,8)] #select names and GOs
splitted <- strsplit(as.character(GO.data$GO.IDs), "; ") #split into multiple GO ids
Gene.GO.IDs <- data.frame(v1 = rep.int(GO.data$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row 
colnames(Gene.GO.IDs) <- c("gene", "GO.IDs") #rename columns
Gene.GO.IDs$GO.IDs <- as.character(Gene.GO.IDs$GO.IDs)
#Gene.GO.IDs$GO.IDs[is.na(Gene.GO.IDs$GO.IDs)] <- "unknown" #replace NA with unknown
GO.IDs <-unique(Gene.GO.IDs$GO.IDs)

# downloaded 20210407: http://www.informatics.jax.org/gotools/data/input/map2MGIslim.txt
GoSlims <- read.csv("RAnalysis/Data/20210407_map2MGIslim.csv", header=TRUE, sep=",", na.string="NA", stringsAsFactors = FALSE, skip=0)
colnames(GoSlims) <- c("GO.IDs", "Cat","GO.Slim.Term")

Gene.GO.IDs.slims <- merge(Gene.GO.IDs, GoSlims, by="GO.IDs", all = TRUE)
Gene.GO.IDs.slims <- Gene.GO.IDs.slims[-1,]

GO.IDs.slims <- Gene.GO.IDs.slims[,-2]

```

## Time dataset
```{bash}
#Time dataset
#mkdir RAnalysis/Data/OSF/Time 


multiIntersectBed -i \
RAnalysis/Data/OSF/Time/EPI-41_S38_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-42_S39_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-43_S40_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-44_S41_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-119_S31_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-120_S32_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-135_S35_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-136_S36_L005_5x.tab \
RAnalysis/Data/OSF/Time/EPI-151_S2_L002_5x.tab \
RAnalysis/Data/OSF/Time/EPI-152_S3_L002_5x.tab \
RAnalysis/Data/OSF/Time/EPI-153_S4_L002_5x.tab \
RAnalysis/Data/OSF/Time/EPI-154_S5_L002_5x.tab \
RAnalysis/Data/OSF/Time/EPI-181_S16_L003_5x.tab \
RAnalysis/Data/OSF/Time/EPI-182_S17_L003_5x.tab \
RAnalysis/Data/OSF/Time/EPI-184_S18_L003_5x.tab \
RAnalysis/Data/OSF/Time/EPI-185_S19_L003_5x.tab \
> RAnalysis/Data/OSF/Time/time.5x.bed

cat RAnalysis/Data/OSF/Time/time.5x.bed | awk '$4 ==16' > RAnalysis/Data/OSF/Time/time.filtered.16.5x.bed 

#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file that ends in posOnly
#b: annotated gene list
#Save output in a new file that has the same base name and ends in -Annotated.txt
for f in RAnalysis/Data/OSF/Time/*5x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \
  > ${f}_gene
done

#intersect with file to subset only those positions found in all samples
for f in RAnalysis/Data/OSF/Time/*_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/Data/OSF/Time/time.filtered.16.5x.bed  \
  > ${f}_CpG_time.bed
done

wc -l RAnalysis/Data/OSF/Time/*_CpG_time.bed
```

## Day10 dataset
```{bash}
# Day 10 Data Set
#mkdir RAnalysis/Data/OSF/D10 

multiIntersectBed -i \
RAnalysis/Data/OSF/D10/EPI-103_S27_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-104_S28_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-111_S29_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-113_S30_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-119_S31_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-120_S32_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-127_S33_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-128_S34_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-135_S35_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-136_S36_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-143_S37_L005_5x.tab \
RAnalysis/Data/OSF/D10/EPI-145_S38_L005_5x.tab \
> RAnalysis/Data/OSF/D10/D10.5x.bed

cat RAnalysis/Data/OSF/D10/D10.5x.bed | awk '$4 ==12' > RAnalysis/Data/OSF/D10/D10.filtered.12.5x.bed 


#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file 
#b: annotated gene list
#Save output in a new file that has the same base name 
for f in RAnalysis/Data/OSF/D10/*5x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \
  > ${f}_gene
done

#intersect with file to subset only those positions found in all samples
for f in RAnalysis/Data/OSF/D10/*_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/Data/OSF/D10/D10.filtered.12.5x.bed   \
  > ${f}_CpG_D10.bed
done

wc -l RAnalysis/Data/OSF/D10/*_CpG_D10.bed
```

## Day135 dataset
```{bash}
# Day 135 Data
#mkdir RAnalysis/Data/OSF/D135 

multiIntersectBed -i \
RAnalysis/Data/OSF/D135/EPI-151_S2_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-152_S3_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-153_S4_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-154_S5_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-159_S6_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-160_S7_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-161_S8_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-162_S9_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-167_S10_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-168_S11_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-169_S12_L002_5x.tab \
RAnalysis/Data/OSF/D135/EPI-170_S13_L002_5x.tab \
> RAnalysis/Data/OSF/D135/D135.5x.bed

cat RAnalysis/Data/OSF/D135/D135.5x.bed | awk '$4 ==12' > RAnalysis/Data/OSF/D135/D135.filtered.12.5x.bed 

#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file 
#b: annotated gene list
#Save output in a new file that has the same base name 
for f in RAnalysis/Data/OSF/D135/*5x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \
  > ${f}_gene
done

#intersect with file to subset only those positions found in all samples
for f in RAnalysis/Data/OSF/D135/*_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/Data/OSF/D135/D135.filtered.12.5x.bed   \
  > ${f}_CpG_D135.bed
done

wc -l RAnalysis/Data/OSF/D135/*_CpG_D135.bed
```

## Day145 dataset
```{bash}
# Day 145 Data

#mkdir RAnalysis/Data/OSF/D145 

multiIntersectBed -i \
RAnalysis/Data/OSF/D145/EPI-175_S14_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-176_S15_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-181_S16_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-182_S17_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-184_S18_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-185_S19_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-187_S20_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-188_S21_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-193_S22_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-194_S23_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-199_S24_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-200_S25_L003_5x.tab \
RAnalysis/Data/OSF/D145/EPI-205_S26_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-206_S27_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-208_S28_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-209_S29_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-214_S30_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-215_S31_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-220_S32_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-221_S33_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-226_S34_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-227_S35_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-229_S36_L004_5x.tab \
RAnalysis/Data/OSF/D145/EPI-230_S37_L004_5x.tab \
> RAnalysis/Data/OSF/D145/D145.5x.bed

cat RAnalysis/Data/OSF/D145/D145.5x.bed | awk '$4 ==24' > RAnalysis/Data/OSF/D145/D145.filtered.24.5x.bed 

#Use intersectBed to find where loci and genes intersect, allowing loci to be mapped to annotated genes
#wb: Print all lines in the second file
#a: file 
#b: annotated gene list
#Save output in a new file that has the same base name 
for f in RAnalysis/Data/OSF/D145/*5x.tab
do
  intersectBed \
  -wb \
  -a ${f} \
  -b RAnalysis/Data/Genome/Panopea-generosa-v1.0.a4.gene.gff3 \
  > ${f}_gene
done

#intersect with file to subset only those positions found in all samples
for f in RAnalysis/Data/OSF/D145/*_gene
do
  intersectBed \
  -a ${f} \
  -b RAnalysis/Data/OSF/D145/D145.filtered.24.5x.bed   \
  > ${f}_CpG_D145.bed
done

wc -l RAnalysis/Data/OSF/D145/*_CpG_D145.bed
```

Load filtered methylation counts (see OSF Repository for all data files)
## Time dataset
```{r}
# Time
meth.data.T <- list.files(path = "RAnalysis/Data/OSF/Time", pattern = "time.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(-c(V3,V7:V14)) %>%
  group_by(Sample.ID)
colnames(meth.data.T) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")

meth.data.T$gene <- gsub(";.*","",meth.data.T$gene) #remove extra characters
meth.data.T$gene <- gsub("ID=","",meth.data.T$gene) #remove extra characters
meth.data.T$Sample.ID <- gsub("RAnalysis/Data/OSF/Time/","",meth.data.T$Sample.ID) #remove extra characters
meth.data.T$Sample.ID <- gsub("_.*","",meth.data.T$Sample.ID) #remove extra characters 
meth.data.T$Sample.ID <- gsub("-","_",meth.data.T$Sample.ID) #remove extra characters

MD.Time <- merge(meth.data.T,sample.info, by="Sample.ID")
```

## Day10 dataset
```{r}
#Day 10
meth.data.D10 <- list.files(path = "RAnalysis/Data/OSF/D10", pattern = "D10.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(-c(V3,V7:V14)) %>%
  group_by(Sample.ID)

colnames(meth.data.D10) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")

meth.data.D10$gene <- gsub(";.*","",meth.data.D10$gene) #remove extra characters
meth.data.D10$gene <- gsub("ID=","",meth.data.D10$gene) #remove extra characters
meth.data.D10$Sample.ID <- gsub("RAnalysis/Data/OSF/D10/","",meth.data.D10$Sample.ID) #remove extra characters
meth.data.D10$Sample.ID <- gsub("_.*","",meth.data.D10$Sample.ID) #remove extra characters 
meth.data.D10$Sample.ID <- gsub("-","_",meth.data.D10$Sample.ID) #remove extra characters

MD.D10 <- merge(meth.data.D10, sample.info, by="Sample.ID")
```

## Day135 dataset
```{r}
#Day 135
meth.data.D135 <- list.files(path = "RAnalysis/Data/OSF/D135", pattern = "D135.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(-c(V3,V7:V14)) %>%
  group_by(Sample.ID)
colnames(meth.data.D135) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")

meth.data.D135$gene <- gsub(";.*","",meth.data.D135$gene) #remove extra characters
meth.data.D135$gene <- gsub("ID=","",meth.data.D135$gene) #remove extra characters
meth.data.D135$Sample.ID <- gsub("RAnalysis/Data/OSF/D135/","",meth.data.D135$Sample.ID) #remove extra characters
meth.data.D135$Sample.ID <- gsub("_.*","",meth.data.D135$Sample.ID) #remove extra characters 
meth.data.D135$Sample.ID <- gsub("-","_",meth.data.D135$Sample.ID) #remove extra characters

MD.D135 <- merge(meth.data.D135, sample.info, by="Sample.ID")
```

## Day145 dataset
```{r}
#Day 145
meth.data.D145 <- list.files(path = "RAnalysis/Data/OSF/D145", pattern = "D145.bed$", full.names=TRUE) %>%
  set_names(.) %>% 
  map_dfr(read.csv,.id="Sample.ID", header=FALSE, sep="\t", na.string="NA", stringsAsFactors = FALSE) %>% 
  dplyr::select(-c(V3,V7:V14)) %>%
  group_by(Sample.ID)
colnames(meth.data.D145) <- c("Sample.ID", "scaffold", "position","per.meth","meth","unmeth","gene")

meth.data.D145$gene <- gsub(";.*","", meth.data.D145$gene) #remove extra characters
meth.data.D145$gene <- gsub("ID=","",meth.data.D145$gene) #remove extra characters
meth.data.D145$Sample.ID <- gsub("RAnalysis/Data/OSF/D145/","",meth.data.D145$Sample.ID) #remove extra characters
meth.data.D145$Sample.ID <- gsub("_.*","",meth.data.D145$Sample.ID) #remove extra characters 
meth.data.D145$Sample.ID <- gsub("-","_",meth.data.D145$Sample.ID) #remove extra characters

MD.D145 <- merge(meth.data.D145, sample.info, by="Sample.ID")
```

GO identification for background set
```{r}
#generate list of all genes identified and background list for GO enrichment analysis
MD.ALL <- rbind(MD.Time, MD.D10, MD.D135, MD.D145)
MD.ALL.Genes <- unique(MD.ALL$gene)
write.csv(MD.ALL.Genes, file = "RAnalysis/Output/MD.ALL.Genes_For_GO_Background.csv")
```

# Testing for Differentially Methylated Genes
## Time
```{r}
# Comparison of Ambient samples through Time
# Binomial GLM to test for differentially methylated genes

sub_meth_table  <- MD.Time
sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)

#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,]

# create data frame to stored results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
  
  #subset the dataframe gene by gene
  sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
  
  # fit glm position model
  fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ TimePoint, 
             data=sub_meth_table1, family=binomial)
  a <- anova(fit, test="Chisq")
  
  # capture summary stats to data frame
  df <- data.frame(gene = sub_meth_table1[1,7],
                   pval.treatment = a$`Pr(>Chi)`[2],
                   #pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene
                   #pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment
                   stringsAsFactors = F)
  
  # bind rows of temporary data frame to the results data frame
  results <- rbind(results, df)
  
}
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.TimePoint <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment

Time.Amb.sig <-results
Time.Amb.sig <- Time.Amb.sig[,c(1,3)]
Time.Amb.sig <- Time.Amb.sig[order(Time.Amb.sig$adj.pval.TimePoint),]
Time.Amb.sig <- Time.Amb.sig[which(Time.Amb.sig$adj.pval.TimePoint<0.05), ]

# Annotation of DMG under ambient conditions through time
Time.Amb.annot <- merge(Time.Amb.sig , Annot, by="gene", all.x=TRUE)
Time.Amb.annot <- Time.Amb.annot[order(Time.Amb.annot$adj.pval.TimePoint),]
Time.Amb.annot <- Time.Amb.annot[!duplicated(Time.Amb.annot$gene),]
write.table(Time.Amb.annot, 'RAnalysis/Output/Supplemental_Tables/TableS10_Time_sig_annot.tsv', sep='\t', row.names=FALSE)


#Sanity check plotting of top DMGs by time
high <- MD.Time %>%	
  filter(gene== Time.Amb.sig$gene[1])
means <- aggregate(per.meth ~ TimePoint, data=high, FUN=mean)
ses <- aggregate(per.meth ~ TimePoint, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(means, aes(x=TimePoint, y=per.meth)) +
  geom_bar(stat="identity", color="black",
             position=position_dodge()) +
    geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2,
                  position=position_dodge(.9)) +
    xlab("TimePoint") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    ylim(0,100)+ #set y limits to 0 and 100%
    scale_fill_manual(values = c("blue", "purple", "red"))+
    theme_bw()

```
## Time GO enrichment
```{r}
# GO Enrichment Analysis of Ambient samples through Time
##### GO enrichment of DMGs #####

ALL<-as.data.frame(MD.ALL.Genes) #set the all gene list
colnames(ALL) <-c("gene") #name the column for merging
DMG <- as.character(Time.Amb.sig$gene) #set the enrichment test list
L <- left_join(ALL, Genes, by="gene") #generate at dataframe to calculate lengths
L$length <- L$stop-L$start # calculated lengths from start and stop
GOAL <- merge(ALL, Gene.GO.IDs.slims,by="gene", all=T) #merge the all gene list with GO ids
GOAL$GO.Slim.Term[is.na(GOAL$GO.Slim.Term)] <- "unknown" #list NA as unknown
splitted <- strsplit(as.character(GOAL$GO.IDs), "; ") #slit into multiple GO ids to get them all in a single row
GO.terms <- data.frame(v1 = rep.int(GOAL$gene, sapply(splitted, length)), v2 = unlist(splitted)) #list all genes with each of their GO terms in a single row 
GO.slimmed.terms <-GOAL[,c(1,4)] #subset slims only

#change to vectors
ALL.vector <-c(t(L$gene))
DMG.vector <-c(t(DMG))
ID.vector <- L$gene
LENGTH.vector <-L$length

gene.vector=as.integer(ALL.vector%in%DMG.vector) #Construct new vector with 1 for DEG and 0 for others
names(gene.vector)=ALL.vector #set names
DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene

#Find enriched GO terms, 
GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)

GO.Time <- GO.wall[order(GO.wall$over_represented_pvalue),]
write.csv(GO.Time , file = "RAnalysis/Output/GO.Time.csv")

colnames(GO.Time)[1] <- "GO.IDs"
GO.Time <- left_join(GO.Time, GO.IDs.slims, by="GO.IDs")
GO.Time <- GO.Time[!duplicated(GO.Time$GO.IDs), ]

sig.GO.Time <- GO.Time %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)
write.csv(sig.GO.Time , file = "RAnalysis/Output/sig.GO.Time.csv")


#save GO enrichment table with gene IDs and protein names

for (i in 1:nrow(sig.GO.Time)){
  sig.GO.Time$gene[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"gene"]), collapse = "; ")
  sig.GO.Time$uniprot[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"uniprot.id"]), collapse = "; ")
  sig.GO.Time$uniprot_taxa.id[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ")
  sig.GO.Time$uniprot.name[i] <- paste(unname(Time.Amb.annot[grep(sig.GO.Time$GO.IDs[i],Time.Amb.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ")
}

write.csv(sig.GO.Time , file = "RAnalysis/Output/Supplemental_Tables/TableS11_sig.GO.Time.csv")


sig.GO.Time.BP <- GO.Time %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  filter(ontology == "BP") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.Time.BP$term <- factor(sig.GO.Time.BP$term, levels = sig.GO.Time.BP$term[nrow(sig.GO.Time.BP):1])

#plot significantly enriched GO terms by Slim Category
TimeGO.BP <-  ggplot(sig.GO.Time.BP, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

sig.GO.Time.MF <- GO.Time %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >1) %>%
  filter(ontology == "MF") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.Time.MF$term <- factor(sig.GO.Time.MF$term, levels = sig.GO.Time.MF$term[nrow(sig.GO.Time.MF):1])

#plot significantly enriched GO terms by Slim Category
TimeGO.MF <-  ggplot(sig.GO.Time.MF, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

Time.GO.Sig <- ggarrange(TimeGO.BP,TimeGO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/TimeGO.pdf", Time.GO.Sig, width = 15, height = 12, units = c("in"))


```

## Day10
```{r}
# Comparison of Treatments at Day 10
# Binomial GLM to test for differentially methylated genes

meth_table  <- MD.D10
sub_meth_table <- meth_table[meth_table$Initial.Treatment == 'Ambient' | meth_table$Initial.Treatment == 'Low' | meth_table$Initial.Treatment == 'Super.Low', ]
sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)
  
#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,]

# create data frame to stored results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
  
  #subset the dataframe gene by gene
  sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
  
  # fit glm position model
  fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment, 
             data=sub_meth_table1, family=binomial, maxit = 100)
  a <- anova(fit, test="Chisq")
  
  # capture summary stats to data frame
  df <- data.frame(gene = sub_meth_table1[1,7],
                   pval.treatment = a$`Pr(>Chi)`[2],
                   #pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene
                   #pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment
                   stringsAsFactors = F)
  
  # bind rows of temporary data frame to the results data frame
  results <- rbind(results, df)
  
}
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.Initial.Treatment <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment

D10.sig <-results
D10.sig <- D10.sig[,c(1,3)]
D10.sig <- D10.sig[order(D10.sig$adj.pval.Initial.Treatment),]
D10.sig <- D10.sig[which(D10.sig$adj.pval.Initial.Treatment<0.05), ]

write.table(D10.sig, 'RAnalysis/Output/D10_sig.tsv', sep='\t', row.names=FALSE)

# Annotation of D10 DMG 
D10.sig.annot <- merge(D10.sig, Annot, by="gene", all.x=TRUE)
D10.sig.annot <- D10.sig.annot[order(D10.sig.annot$adj.pval.Initial.Treatment),]
D10.sig.annot <- D10.sig.annot[!duplicated(D10.sig.annot$gene),]
write.table(D10.sig.annot, 'RAnalysis/Output/Supplemental_Tables/TableS12_D10_sig_annot.tsv', sep='\t', row.names=FALSE)

#Sanity check plotting of top DMGs
high <- MD.D10 %>%	
  filter(gene== D10.sig$gene[1])
means <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=mean)
ses <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(means, aes(x=Initial.Treatment, y=per.meth, fill=Initial.Treatment)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2,
position=position_dodge(.9)) +
xlab("Treatment") + #plot x axis label
ylab("Percent Methylation") + #plot y axis label
ylim(0,100)+ #set y limits to 0 and 100%
scale_fill_manual(values = c("blue", "purple", "red"))+
theme_bw()

```
## Day10 GO enrichment
```{r}
# GO Enrichment Analysis of Treatment at Day 10
##### GO enrichment of DMGs #####

DMG.D10 <- as.character(D10.sig$gene) #set the enrichment test list
DMG.D10.vector <-c(t(DMG.D10)) #change to vectors
gene.vector=as.integer(ALL.vector%in%DMG.D10.vector) #Construct new vector with 1 for DEG and 0 for others
names(gene.vector)=ALL.vector #set names
DEG.D10.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene

#Find enriched GO terms, 
GO.D10.wall<-goseq(DEG.D10.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)

GO.D10 <- GO.D10.wall[order(GO.D10.wall$over_represented_pvalue),]
write.csv(GO.D10 , file = "RAnalysis/Output/GO.D10.csv")

colnames(GO.D10)[1] <- "GO.IDs"
GO.D10 <- left_join(GO.D10, GO.IDs.slims, by="GO.IDs")
GO.D10 <- GO.D10[!duplicated(GO.D10$GO.IDs), ]

sig.GO.D10 <- GO.D10 %>%
  filter(over_represented_pvalue <0.05) %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)
write.csv(sig.GO.D10 , file = "RAnalysis/Output/sig.GO.D10.csv")

#save GO enrichment table with gene IDs and protein names

for (i in 1:nrow(sig.GO.D10)){
  sig.GO.D10$gene[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"gene"]), collapse = "; ")
  sig.GO.D10$uniprot[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"uniprot.id"]), collapse = "; ")
  sig.GO.D10$uniprot_taxa.id[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ")
  sig.GO.D10$uniprot.name[i] <- paste(unname(D10.sig.annot[grep(sig.GO.D10$GO.IDs[i],D10.sig.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ")
}

write.csv(sig.GO.D10 , file = "RAnalysis/Output/Supplemental_Tables/TableS13_sig.GO.D10.csv")


sig.GO.D10.BP <- GO.D10 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  filter(ontology == "BP") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D10.BP$term <- factor(sig.GO.D10.BP$term, levels = sig.GO.D10.BP$term[nrow(sig.GO.D10.BP):1])

#plot significantly enriched GO terms by Slim Category
D10GO.BP <-  ggplot(sig.GO.D10.BP, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

sig.GO.D10.MF <- GO.D10 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >1) %>%
  filter(ontology == "MF") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D10.MF$term <- factor(sig.GO.D10.MF$term, levels = sig.GO.D10.MF$term[nrow(sig.GO.D10.MF):1])

#plot significantly enriched GO terms by Slim Category
D10GO.MF <-  ggplot(sig.GO.D10.MF, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

D10.GO.Sig <- ggarrange(D10GO.BP,D10GO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/GO.D10.pdf", D10.GO.Sig, width = 15, height = 12, units = c("in"))


```

```{r}
# Comparison of Treatments at Day 135
# Binomial GLM to test for differentially methylated genes

meth_table  <- MD.D135
sub_meth_table <- meth_table[meth_table$Initial.Treatment == 'Ambient' | meth_table$Initial.Treatment == 'Low' | meth_table$Initial.Treatment == 'Super.Low', ]

sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)

#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,]

# create data frame to store results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
  
  #subset the dataframe gene by gene
  sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
  
  #if(length(unique(sub_meth_table1$position)) >1){
  
  # fit glm position model
  fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment, 
             data=sub_meth_table1, family=binomial)
  #} else {
  #  # fit glm model
  #  fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment * position, 
  #             data=sub_meth_table1, family=binomial)
  #}
  #s <- step(fit, trace=0)
  a <- anova(fit, test="Chisq")
  a
  # capture summary stats to data frame
  df <- data.frame(gene = sub_meth_table1[1,7],
                   pval.treatment = a$`Pr(>Chi)`[2],
                   #pval.position = a$`Pr(>Chi)`[3],
                   #pval.treatment_x_position = a$`Pr(>Chi)`[4],
                   stringsAsFactors = F)
  
  # bind rows of temporary data frame to the results data frame
  results <- rbind(results, df)
  
}
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.treatment <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH')
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH')

D135.sig <-results
D135.sig <- D135.sig[,c(1,3)]
D135.sig <- D135.sig[order(D135.sig$adj.pval.treatment),]
D135.sig <- D135.sig[which(D135.sig$adj.pval.treatment<0.05), ]
write.table(D135.sig, 'RAnalysis/Output/D135_sig.tsv', sep='\t', row.names=FALSE)

# Annotation of D135 DMG 
D135.sig.annot <- merge(D135.sig, Annot, by="gene", all.x=TRUE)
D135.sig.annot <- D135.sig.annot[order(D135.sig.annot$adj.pval.treatment),]
D135.sig.annot <- D135.sig.annot[!duplicated(D135.sig.annot$gene),]
write.table(D135.sig.annot, 'RAnalysis/Output/Supplemental_Tables/TableS14_D135_sig_annot.tsv', sep='\t', row.names=FALSE)

#Sanity check plotting of top DMGs
high <- MD.D135 %>%	
  filter(gene== D135.sig$gene[1])
means <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=mean)
ses <- aggregate(per.meth ~ Initial.Treatment, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(means, aes(x=Initial.Treatment, y=per.meth, fill=Initial.Treatment)) +
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), width=.2,
position=position_dodge(.9)) +
xlab("Treatment") + #plot x axis label
ylab("Percent Methylation") + #plot y axis label
ylim(0,100)+ #set y limits to 0 and 100%
scale_fill_manual(values = c("blue", "purple", "red"))+
theme_bw()
```

```{r}
# GO Enrichment Analysis of Treatment at Day 135
##### GO enrichment of DMGs #####
DMG.D135 <- as.character(D135.sig$gene) #set the enrichment test list

DMG.D135.vector <-c(t(DMG.D135)) #change to vectors
gene.vector=as.integer(ALL.vector%in%DMG.D135.vector) #Construct new vector with 1 for DEG and 0 for others
names(gene.vector)=ALL.vector #set names
DEG.D135.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene

#Find enriched GO terms, 
GO.D135.wall<-goseq(DEG.D135.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)

GO.D135 <- GO.D135.wall[order(GO.D135.wall$over_represented_pvalue),]
write.csv(GO.D135 , file = "RAnalysis/Output/GO.D135.csv")

colnames(GO.D135)[1] <- "GO.IDs"
GO.D135 <- left_join(GO.D135, GO.IDs.slims, by="GO.IDs")
GO.D135 <- GO.D135[!duplicated(GO.D135$GO.IDs), ]

sig.GO.D135 <- GO.D135 %>%
  filter(over_represented_pvalue <0.05) %>%
  #filter(numInCat >5) %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)
write.csv(sig.GO.D135 , file = "RAnalysis/Output/sig.GO.D135.csv")

#save GO enrichment table with gene IDs and protein names
#for D135
for (i in 1:nrow(sig.GO.D135)){
  sig.GO.D135$gene[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"gene"]), collapse = "; ")
  sig.GO.D135$uniprot[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"uniprot.id"]), collapse = "; ")
  sig.GO.D135$uniprot_taxa.id[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ")
  sig.GO.D135$uniprot.name[i] <- paste(unname(D135.sig.annot[grep(sig.GO.D135$GO.IDs[i],D135.sig.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ")
}
write.csv(sig.GO.D135 , file = "RAnalysis/Output/Supplemental_Tables/TableS15_sig.GO.D135.csv")


sig.GO.D135.BP <- GO.D135 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  filter(ontology == "BP") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D135.BP$term <- factor(sig.GO.D135.BP$term, levels = sig.GO.D135.BP$term[nrow(sig.GO.D135.BP):1])

#plot significantly enriched GO terms by Slim Category
D135GO.BP <-  ggplot(sig.GO.D135.BP, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

sig.GO.D135.MF <- GO.D135 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >1) %>%
  filter(ontology == "MF") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D135.MF$term <- factor(sig.GO.D135.MF$term, levels = sig.GO.D135.MF$term[nrow(sig.GO.D135.MF):1])

#plot significantly enriched GO terms by Slim Category
D135GO.MF <-  ggplot(sig.GO.D135.MF, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

D135.GO.Sig <- ggarrange(D135GO.BP,D135GO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/GO.D135.pdf", D135.GO.Sig, width = 15, height = 12, units = c("in"))
```

```{r}
# Comparison of Treatments at Day 145
# Binomial GLM to test for differentially methylated genes
meth_table <- MD.D145
sub_meth_table <- meth_table[meth_table$Initial.Treatment == "Ambient" | meth_table$Initial.Treatment == "Low" | meth_table$Initial.Treatment == "Super.Low"
                             | meth_table$Secondary.Treatment == "Ambient" | meth_table$Secondary.Treatment == "Low", ]
sub_meth_table$group <- paste0(sub_meth_table$Sample.ID, sub_meth_table$gene)

#filter for genes with >5 methylated positions
min.filt <- count(sub_meth_table, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- sub_meth_table[sub_meth_table$group %in% newdata$vars,]

# create data frame to store results
results <- data.frame()
gs <- unique(sub_meth_table$gene)

#first subset the unique dataframes and second run the GLMs
for(i in 1:length(sub_meth_table$gene)){
    
    #subset the dataframe gene by gene
    sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
    
    # fit glm position model
    fit <- glm(matrix(c(meth, unmeth), ncol=2) ~ Initial.Treatment * Secondary.Treatment, 
               data=sub_meth_table1, family=binomial)

    a <- anova(fit, test="LRT")
    
    # capture summary stats to data frame
    df <- data.frame(gene = sub_meth_table1[1,7],
                     pval.treatment1 = a$`Pr(>Chi)`[2],
                     pval.treatment2 = a$`Pr(>Chi)`[3],
                     #position = a$`Pr(>Chi)`[2], 
                     pval.treatment1_x_treatment2 = a$`Pr(>Chi)`[4],
                     #pval.treatment1_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     #pval.treatment1_x_pval.treatment2_x_position = a$`Pr(>Chi)`[2],
                     stringsAsFactors = F)

    # bind rows of temporary data frame to the results data frame
    results <- rbind(results, df)
    
  }
# An error will be generated here for contrasts. 
#This potential for contrasts (interactions) is included in the case one wants to examine the role of position of CpG within a gene
#Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : contrasts can be applied only to factors with 2 or more levels
#Continuing the analysis from results line will generate the results in the absence of the contrast (interaction).

results[is.na(results)] <- 0

results$adj.pval.treatment1 <- p.adjust(results$pval.treatment1, method='BH')
results$adj.pval.treatment2 <- p.adjust(results$pval.treatment2, method='BH')
results$adj.pval.treatment1_x_treatment2 <- p.adjust(results$pval.treatment1_x_treatment2, method='BH')
#results$adj.pval.treatment1_x_position <- p.adjust(results$pval.treatment1_x_position, method='BH')
#results$adj.pval.treatment2_x_position <- p.adjust(results$pval.treatment2_x_position, method='BH')
#results$adj.pval.treatment1_x_pval.treatment2_x_position <- p.adjust(results$pval.treatment1_x_pval.treatment2_x_position, method='BH')

D145.sig <-results
D145.sig <- D145.sig[,c(1,5:7)]

# Identifying DMG with significant interaction
D145.sig.int <- D145.sig[order(D145.sig$adj.pval.treatment1_x_treatment2),]
D145.sig.int <- D145.sig.int[which(D145.sig$adj.pval.treatment1_x_treatment2<0.05), ]
write.table(D145.sig, 'RAnalysis/Output/D145_sig.int.tsv', sep='\t', row.names=FALSE)

# Annotation of Secondary Exposure Interaction DMG 
D145.sig.int.annot <- merge(D145.sig.int, Annot, by="gene", all.x=TRUE)
D145.sig.int.annot <- D145.sig.int.annot[order(D145.sig.int.annot$adj.pval.treatment1_x_treatment2),]
D145.sig.int.annot <- D145.sig.int.annot[!duplicated(D145.sig.int.annot$gene),]
write.table(D145.sig.int.annot, 'RAnalysis/Output/Supplemental_Tables/TableS16_D145_sig_annot_Interactions.tsv', sep='\t', row.names=FALSE)


#Sanity check plotting of top DMGs with sig interaction
high <- MD.D145 %>%	
  filter(gene== D145.sig$gene[1])
means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean)
ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("blue", "purple", "red")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Secondary Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()

# Identifying DMG with significant main effect of Exposure 1
D145.sig.Exp1 <- D145.sig[which(D145.sig$adj.pval.treatment1<0.05), ]
write.table(D145.sig.Exp1, 'RAnalysis/Output/D145_sig.Exp1.tsv', sep='\t', row.names=FALSE)

# Annotation of Main Effect of Initial for 145 DMGs 
D145.sig.Exp1.annot <- merge(D145.sig.Exp1 , Annot, by="gene", all.x=TRUE)
D145.sig.Exp1.annot <- D145.sig.Exp1.annot[!duplicated(D145.sig.Exp1.annot$gene),]
write.table(D145.sig.Exp1.annot , 'RAnalysis/Output/D145_sig_annot_Exposure1.tsv', sep='\t', row.names=FALSE)

#Sanity check plotting of top DMGs with sig Exposure 1
high <- MD.D145 %>%	
  filter(gene== D145.sig.Exp1$gene[1])
means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean)
ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("blue", "purple", "red")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Secondary Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()


# Identifying DMG with significant main effect of Exposure 2
D145.sig.Exp2 <- D145.sig[which(D145.sig$adj.pval.treatment2<0.05), ]
write.table(D145.sig.Exp2, 'RAnalysis/Output/D145_sig.Exp2.tsv', sep='\t', row.names=FALSE)

# Annotation of Main Effect of Secondary for 145 DMGs 
D145.sig.Exp2.annot <- merge(D145.sig.Exp2 , Annot, by="gene", all.x=TRUE)
D145.sig.Exp2.annot <- D145.sig.Exp2.annot[!duplicated(D145.sig.Exp2.annot$gene),]
#write.table(D145.sig.Exp2.annot , 'RAnalysis/Output/D145_sig_annot_Exposure2.tsv', sep='\t', row.names=FALSE)

#Sanity check plotting of top DMGs with sig Exposure 1
high <- MD.D145 %>%	
  filter(gene== D145.sig.Exp2$gene[66])
means <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=mean)
ses <- aggregate(per.meth ~ Initial.Treatment*Secondary.Treatment, data=high, FUN=std.error)
means$ses <- ses$per.meth
ggplot(data=means, aes(x=Secondary.Treatment, y=per.meth, group=Initial.Treatment, colour=Initial.Treatment, shape=Initial.Treatment)) + #plot data
    geom_line(size=0.7, position=position_dodge(.1)) + #plot lines
    scale_colour_manual(values = c("blue", "purple", "red")) + #set line color
    geom_point(size=4, position=position_dodge(.1), colour="black") + #set point characteristics
    scale_shape_manual(values=c(1,18,16)) + #set shapes
    geom_errorbar(aes(ymin=per.meth-ses, ymax=per.meth+ses), #plot error bars
    width=0, position=position_dodge(.1), colour="black") + #set error bar characteristics
    xlab("Secondary Treatment") + #plot x axis label
    ylab("Percent Methylation") + #plot y axis label
    theme_bw()


```

```{r}
# GO Enrichment Analysis of Treatment at Day 145
##### GO enrichment of DMGs #####
DMG.D145 <- as.character(D145.sig.int$gene) #set the enrichment test list

DMG.D145.vector <-c(t(DMG.D145)) #change to vectors
gene.vector=as.integer(ALL.vector%in%DMG.D145.vector) #Construct new vector with 1 for DEG and 0 for others
names(gene.vector)=ALL.vector #set names
DEG.D145.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene

#Find enriched GO terms, 
GO.D145.wall<-goseq(DEG.D145.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)

GO.D145 <- GO.D145.wall[order(GO.D145.wall$over_represented_pvalue),]
write.csv(GO.D145 , file = "RAnalysis/Output/GO.D145.Interaction.csv")

colnames(GO.D145)[1] <- "GO.IDs"
GO.D145 <- left_join(GO.D145, GO.IDs.slims, by="GO.IDs")
GO.D145 <- GO.D145[!duplicated(GO.D145$GO.IDs), ]

sig.GO.D145 <- GO.D145 %>%
  filter(over_represented_pvalue <0.05) %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)
write.csv(sig.GO.D145 , file = "RAnalysis/Output/sig.GO.D145.csv")

#save GO enrichment table with gene IDs and protein names
for (i in 1:nrow(sig.GO.D145)){
  sig.GO.D145$gene[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"gene"]), collapse = "; ")
  sig.GO.D145$uniprot[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"uniprot.id"]), collapse = "; ")
  sig.GO.D145$uniprot_taxa.id[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ")
  sig.GO.D145$uniprot.name[i] <- paste(unname(D145.sig.int.annot[grep(sig.GO.D145$GO.IDs[i],D145.sig.int.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ")
}

write.csv(sig.GO.D145 , file = "RAnalysis/Output/Supplemental_Tables/TableS17_sig.GO.D145.Interactions.csv")


sig.GO.D145.BP <- GO.D145 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  filter(ontology == "BP") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D145.BP$term <- factor(sig.GO.D145.BP$term, levels = sig.GO.D145.BP$term[nrow(sig.GO.D145.BP):1])

#plot significantly enriched GO terms by Slim Category
D145GO.BP <-  ggplot(sig.GO.D145.BP, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

sig.GO.D145.MF <- GO.D145 %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >1) %>%
  filter(ontology == "MF") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.D145.MF$term <- factor(sig.GO.D145.MF$term, levels = sig.GO.D145.MF$term[nrow(sig.GO.D145.MF):1])

#plot significantly enriched GO terms by Slim Category
D145GO.MF <-  ggplot(sig.GO.D145.MF, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

D145.GO.Sig <- ggarrange(D145GO.BP,D145GO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/GO.D145.pdf", D145.GO.Sig, width = 15, height = 12, units = c("in"))

```

```{r}
##### Persistent DMGs #####

#Persistent from first exposure throughout common garden
ints_10_135 <- merge(D10.sig, D135.sig, by="gene")
ints_10_135.annot <- merge(ints_10_135, Annot, by="gene", all.x=TRUE)
ints_10_135.annot <- ints_10_135.annot[!duplicated(ints_10_135.annot$gene),]
ints_10_135.annot <- na.omit(ints_10_135.annot)

#Persistent throughout all timepoints
ints_all <- merge(ints_10_135, D145.sig, by="gene")
ints_all.annot <- merge(ints_all, Annot, by="gene", all.x=TRUE)
ints_all.annot <- ints_all.annot[!duplicated(ints_all.annot$gene),]
write.table(ints_all.annot, 'RAnalysis/Output/Supplemental_Tables/TableS18_DMGs_persist.tsv', sep='\t', row.names=FALSE)
ints_all.annot <- na.omit(ints_all.annot)

```

```{r}
# GO Enrichment Analysis of DMG present in all comparisons
#### GO enrichment of DMGs #####
DMG.Persist <- as.character(ints_all.annot$gene) #set the enrichment test list

#change contig lists to vectors
DMG.vector <-c(t(DMG.Persist))

gene.vector=as.integer(ALL.vector%in%DMG.vector) #Construct new vector with 1 for DEG and 0 for others
names(gene.vector)=ALL.vector #set names
DEG.pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector) #weight vector by length of gene

#Find enriched GO terms, 
GO.wall<-goseq(DEG.pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)

GO.Persist <- GO.wall[order(GO.wall$over_represented_pvalue),]
write.csv(GO.Persist  , file = "RAnalysis/Output/GO.Persist .csv")

colnames(GO.Persist )[1] <- "GO.IDs"
GO.Persist <- left_join(GO.Persist, GO.IDs.slims, by="GO.IDs")
GO.Persist  <- GO.Persist [!duplicated(GO.Persist$GO.IDs), ]

sig.GO.Persist <- GO.Persist %>%
  filter(over_represented_pvalue <0.05) %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)
write.csv(sig.GO.Persist , file = "RAnalysis/Output/sig.GO.Persist.csv")


#save GO enrichment table with gene IDs and protein names
for (i in 1:nrow(sig.GO.Persist)){
  sig.GO.Persist$gene[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"gene"]), collapse = "; ")
  sig.GO.Persist$uniprot[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"uniprot.id"]), collapse = "; ")
  sig.GO.Persist$uniprot_taxa.id[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"uniprot_taxa.id"]), collapse = "; ")
  sig.GO.Persist$uniprot.name[i] <- paste(unname(ints_all.annot[grep(sig.GO.Persist$GO.IDs[i],ints_all.annot$GO.IDs),"Uniprot.Name"]), collapse = "; ")
}

write.csv(sig.GO.Persist , file = "RAnalysis/Output/Supplemental_Tables/TableS19_sig.GO.Persistent.annot.csv")


sig.GO.Persist.BP <- GO.Persist %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >5) %>%
  filter(ontology == "BP") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.Persist.BP$term <- factor(sig.GO.Persist.BP$term, levels = sig.GO.Persist.BP$term[nrow(sig.GO.Persist.BP):1])

#plot significantly enriched GO terms by Slim Category
PersistGO.BP <-  ggplot(sig.GO.Persist.BP, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

sig.GO.Persist.MF <- GO.Persist %>%
  filter(over_represented_pvalue <0.05) %>%
  filter(numInCat >1) %>%
  filter(ontology == "MF") %>%
  arrange(., ontology, GO.Slim.Term, over_represented_pvalue)

sig.GO.Persist.MF$term <- factor(sig.GO.Persist.MF$term, levels = sig.GO.Persist.MF$term[nrow(sig.GO.D145.MF):1])

#plot significantly enriched GO terms by Slim Category
PersistGO.MF <-  ggplot(sig.GO.Persist.MF, aes(x = ontology, y = term)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 10),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/GO.Persist.pdf", Persist.GO.Sig, width = 15, height = 12, units = c("in"))

```

```{r}
##### Heatmap of DMG in Ambient by time #####
Time.Amb.meth_table.means <- aggregate(per.meth ~ TimePoint*gene, data=MD.Time, FUN=mean)
Time.Amb.meth_table.means <- Time.Amb.meth_table.means[Time.Amb.meth_table.means$gene %in% Time.Amb.sig$gene,]

Time.Amb <- Time.Amb.meth_table.means %>% pivot_wider(names_from = c(TimePoint), values_from = per.meth)
Time.Amb.mat <- as.matrix(Time.Amb[,-1])


#new code as of Feb 22, 2021 (S.A.T.)
pdf("RAnalysis/Output/Time.DMG_Rel_HEATMAP_v2.pdf")
heatmap.2(Time.Amb.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() {
  breaks = pretty(parent.frame()$breaks)
  #breaks = breaks[c(1,length(breaks))]
  list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=FALSE, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE, lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4))
dev.off()

```

```{r}
##### Heatmap of DMG in D10 #####
D10.meth_table.means <- aggregate(per.meth ~ Initial.Treatment*gene, data=MD.D10, FUN=mean)
D10.meth_table.means <- D10.meth_table.means[D10.meth_table.means$gene %in% D10.sig$gene,]

D10 <- D10.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment), values_from = per.meth)
D10.mat <- as.matrix(D10[,-1])

col.order <- c( "Ambient",
                "Low", 
                "Super.Low")
D10.mat <- D10.mat[,col.order]

#new code as of Feb 22, 2021 (S.A.T.)
colnames(D10.mat) <- c("pH 7.9", "pH 7.4", "pH 7.0")

pdf("RAnalysis/Output/D10.DMG_Rel_HEATMAP_v2.pdf")
heatmap.2(D10.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() {
  breaks = pretty(parent.frame()$breaks)
  #breaks = breaks[c(1,length(breaks))]
  list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=NA, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4))
dev.off()


```

```{r}
##### Heatmap of DMG in D135 #####
D135.meth_table.means <- aggregate(per.meth ~ Initial.Treatment*gene, data=MD.D135, FUN=mean)
D135.meth_table.means <- D135.meth_table.means[D135.meth_table.means$gene %in% D135.sig$gene,]

D135 <- D135.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment), values_from = per.meth)
D135.mat <- as.matrix(D135[,-1])

col.order <- c( "Ambient",
                "Low", 
                "Super.Low")
D135.mat <- D135.mat[,col.order]

#new code as of Feb 22, 2021 (S.A.T.)

colnames(D135.mat) <- c("pH 7.9", "pH 7.4", "pH 7.0")

pdf("RAnalysis/Output/D135.DMG_Rel_HEATMAP_v2.pdf")
heatmap.2(D135.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() {
  breaks = pretty(parent.frame()$breaks)
  #breaks = breaks[c(1,length(breaks))]
  list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=NA, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4))
dev.off()

```

```{r}
##### Heatmap of DMG in D145 #####
D145.meth_table.means <- aggregate(per.meth ~ gene*Initial.Treatment*Secondary.Treatment, data=MD.D145, FUN=mean)
D145.meth_table.means <- D145.meth_table.means[D145.meth_table.means$gene %in% D145.sig$gene,]

D145 <- D145.meth_table.means %>% pivot_wider(names_from = c(Initial.Treatment, Secondary.Treatment), values_from = per.meth)
D145.mat <- as.matrix(D145[,-1])

col.order <- c("Ambient_Ambient", "Ambient_Low", "Low_Ambient", "Low_Low", "Super.Low_Ambient", "Super.Low_Low")
D145.mat  <- D145.mat[,col.order]

#new code as of Feb 22, 2021 (S.A.T.)
colnames(D145.mat) <- c("pH 7.9 -> pH 7.9", "pH 7.9 -> pH 7.4","pH 7.4 -> pH 7.9","pH 7.4 -> pH 7.4", "pH 7.0 -> pH 7.9", "pH 7.0 -> pH 7.4")

pdf("RAnalysis/Output/D145.DMG_Rel_HEATMAP_v2.pdf")
heatmap.2(D145.mat,margins = c(5,20),cexCol = 1.5, distfun = function(x) as.dist(1 - cor(t(x), use = "pa")), hclustfun = function(x) hclust(x,method = 'average'),key.xtickfun = function() {
  breaks = pretty(parent.frame()$breaks)
  #breaks = breaks[c(1,length(breaks))]
  list(at = parent.frame()$scale01(breaks),labels = breaks)},Colv=FALSE, col= rev(RColorBrewer::brewer.pal(name = "RdYlBu", n = 10)), density.info = "none", trace = "none", scale = "row", labRow = FALSE,sepcolor="white",sepwidth=c(0.1,0.01),colsep=c(0,2,4,6), lmat = rbind(c(0,3),c(2,1),c(4,0)),keysize=1, key.par = list(cex=0.65),lhei=c(1.5,4,1), lwid = c(1.5,4))
dev.off()

```

```{r}
### Magnitude of change in secondary exposure

D145.mat.1 <- D145.mat[,c(1:2)]
D145.mat.1 <- as.data.frame(D145.mat.1)
D145.mat.1$gene <-rownames(D145.mat.1)
D145.mat.1$diff <- D145.mat.1$`pH 7.9 -> pH 7.9` - D145.mat.1$`pH 7.9 -> pH 7.4`
A <- round(mean(D145.mat.1$diff),2)
std.error(D145.mat.1$diff)
var(D145.mat.1$diff)

D145.mat.2 <- D145.mat[,c(3:4)]
D145.mat.2 <- as.data.frame(D145.mat.2)
D145.mat.2$gene <-rownames(D145.mat.2)
D145.mat.2$diff <- D145.mat.2$`pH 7.4 -> pH 7.9` - D145.mat.2$`pH 7.4 -> pH 7.4`
M <-round(mean(D145.mat.2$diff),2)
std.error(D145.mat.2$diff)
var(D145.mat.2$diff)

D145.mat.3 <- D145.mat[,c(5:6)]
D145.mat.3 <- as.data.frame(D145.mat.3)
D145.mat.3$gene <-rownames(D145.mat.3)
D145.mat.3$diff <- D145.mat.3$`pH 7.0 -> pH 7.9` - D145.mat.3$`pH 7.0 -> pH 7.4`
L <-round(mean(D145.mat.3$diff), 2)
std.error(D145.mat.3$diff)
var(D145.mat.3$diff)

labs1 <- c(A, M, L)

A
L
M
```


Plotting Percent methylation for Persistent DMGs
```{r}
##### Heatmap  #####

#D10
Persistant.meth_table.means.D10 <- MD.ALL %>% filter(TimePoint=="Day10") %>%
  group_by(gene, Initial.Treatment) %>%
  summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth)))

Persistant.meth_table.means.D10  <- Persistant.meth_table.means.D10 [Persistant.meth_table.means.D10 $gene %in% ints_all$gene,]


Pers.D10.HM <- Persistant.meth_table.means.D10 %>% 
  ggplot(., aes(x = Initial.Treatment, y = gene, fill =avg)) + 
  geom_tile() + 
  scale_fill_viridis(discrete=FALSE)+
  #scale_fill_gradientn(colours = c("black", "green","cyan", "red", "orange", "yellow"), values = c(0,20, 40, 60, 80,100))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 5),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

#Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/Persist.D10.HM.pdf", Pers.D10.HM, width = 5, height = 30, units = c("in"))

#D135
Persistant.meth_table.means.D135 <- MD.ALL %>% filter(TimePoint=="Day135") %>%
  group_by(gene, Initial.Treatment) %>%
  summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth)))

Persistant.meth_table.means.D135  <- Persistant.meth_table.means.D135[Persistant.meth_table.means.D135$gene %in% ints_all$gene,]


Pers.D135.HM <- Persistant.meth_table.means.D135 %>% 
  ggplot(., aes(x = Initial.Treatment, y = gene, fill =avg)) + 
  geom_tile() + 
  scale_fill_viridis(discrete=FALSE)+
  #scale_fill_gradientn(colours = c("black", "green","cyan", "red", "orange", "yellow"), values = c(0,20, 40, 60, 80,100))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 5),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

#Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/Persist.D135.HM.pdf", Pers.D135.HM, width = 5, height = 30, units = c("in"))

#D145
Persistant.meth_table.means.D145 <- MD.ALL %>% filter(TimePoint=="Day145") %>%
  group_by(gene, Initial.Treatment, Secondary.Treatment) %>%
  summarise(avg = mean(per.meth), sem = sd(per.meth)/sqrt(length(per.meth)))

Persistant.meth_table.means.D145  <- Persistant.meth_table.means.D145[Persistant.meth_table.means.D145$gene %in% ints_all$gene,]

Persistant.meth_table.means.D145$group <- paste0(Persistant.meth_table.means.D145$Initial.Treatment, "_", Persistant.meth_table.means.D145$Secondary.Treatment)

Pers.D145.HM <- Persistant.meth_table.means.D145 %>% 
  ggplot(., aes(x = group, y = gene, fill =avg)) + 
  geom_tile() + 
  scale_fill_viridis(discrete=FALSE)+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 5),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 8),
  axis.title.x = element_blank(),
  axis.title.y = element_blank())

Persist.DMGs <- ggarrange(Pers.D10.HM,Pers.D135.HM, Pers.D145.HM, ncol=3, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/Persist.alltimes.HM.pdf", Persist.DMGs, width = 15, height = 30, units = c("in"))


```

```{r}
#obtain genes from sig.GO.D10, sig.GO.D135, sig.GO.D145
#plot geom_tile heatmap as above with only those genes faceted by goslim term

#add column to sig.GO dataframes to denote sampling time point
sig.GO.D10$TP <- "Day10"
sig.GO.D135$TP <- "Day135"
sig.GO.D145$TP <- "Day145"

#bind data frames together
byGO <- as.data.frame(rbind(sig.GO.D10, sig.GO.D135, sig.GO.D145))

#create list objects where each list item is a vector containing genes, uniprot ids, or taxa ids for each line 
split_byGO_gene <- strsplit(as.character(byGO$gene), "; ") #split into multiple gene ids
split_byGO_uniprot <- strsplit(as.character(byGO$uniprot), "; ") #split into multiple uniprot ids
split_byGO_taxa <- strsplit(as.character(byGO$uniprot_taxa.id), "; ") #split into multiple taxa ids

#create a new data frame that lists each gene, uniprot ID, and taxa id on a separate line (so the GO IDs are listed multiple times for each gene). 
#For the first data.frame Gene.byGO.IDs, I'm adding all the data from byGO and this will be maintained when it is merged with the uniprot info
Gene.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_gene, length)), over_represented_pvalue = rep.int(byGO$over_represented_pvalue, sapply(split_byGO_gene, length)), under_represented_pvalue = rep.int(byGO$under_represented_pvalue, sapply(split_byGO_gene, length)) , numDEInCat = rep.int(byGO$numDEInCat, sapply(split_byGO_gene, length)), numInCat = rep.int(byGO$numInCat, sapply(split_byGO_gene, length)), term = rep.int(byGO$term, sapply(split_byGO_gene, length)), ontology = rep.int(byGO$ontology, sapply(split_byGO_gene, length)), Cat = rep.int(byGO$Cat, sapply(split_byGO_gene, length)),GO.Slim.Term = rep.int(byGO$GO.Slim.Term, sapply(split_byGO_gene, length)),TP = rep.int(byGO$TP, sapply(split_byGO_gene, length)), gene = unlist(split_byGO_gene)) 

#create a dataframe containing the GO.IDs and uniprot IDs all listed on separate lines
unip.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_uniprot, length)), uniprot = unlist(split_byGO_uniprot)) 

#create a dataframe containing the GO.IDs and uniprot taxa IDs all listed on separate lines

taxa.byGO.IDs <- data.frame(GO.IDs = rep.int(byGO$GO.IDs, sapply(split_byGO_taxa, length)), uniprot_taxa.id = unlist(split_byGO_taxa)) 

#merge the dataframes with everything listed on separate lines
byGO_long <- cbind(Gene.byGO.IDs, "uniprot" =unip.byGO.IDs[,"uniprot"],uniprot_taxa.id =taxa.byGO.IDs[,"uniprot_taxa.id"])

#remove the species from the taxa id
byGO_long$uniprot_taxa.id <- gsub("_.*", "",byGO_long$uniprot_taxa.id)

#re-order the df by slim, timepoint, then pvalue
byGO_long  <- byGO_long[order(byGO_long$GO.Slim.Term,byGO_long$TP,byGO_long$over_represented_pvalue),]

# paste gene uniprot id next to terms
for (i in 1:nrow(byGO_long)){
  byGO_long$taxa.id[i] <- paste(unique(unname(byGO_long[grep(byGO_long$GO.IDs[i],byGO_long$GO.IDs),"uniprot_taxa.id"])), collapse = "; ")
  
}

byGO_long_term_gene <- unique(byGO_long[,-grep("^gene$|uniprot", colnames(byGO_long))])

byGO_long_term_gene$term_gene <- paste0(byGO_long_term_gene$term, " (",byGO_long_term_gene$taxa.id,")")


### add order column to order terms by p.value in heatmap
byGO_long_term_gene <- byGO_long_term_gene %>% arrange(GO.Slim.Term, over_represented_pvalue) %>% mutate(order = row_number())


###make the same order number for duplicate terms so that terms found across timepoints can be plotted in the same row
#extract the order number for the term with the lowest p.value
byGO_long_term_gene_rank <- byGO_long_term_gene %>% group_by(term_gene) %>% dplyr::slice(which.min(order))
#save only the term_gene and order columns so these can be merged with the main df
byGO_long_term_gene_rank <- byGO_long_term_gene_rank[,c("term_gene", "order")]
#rename the order column that will be merged
colnames(byGO_long_term_gene_rank) <- c("term_gene", "order2")                     
#merge the new order numbers with the main df                      
byGO_long_term_gene <- merge(byGO_long_term_gene, byGO_long_term_gene_rank, by = "term_gene")
                       
#plot all BP categories           
FigS5<-  byGO_long_term_gene %>% filter(ontology=="BP", TP!= "Persistent") %>%
  ggplot(., aes(x = TP, y = order2)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ ., scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 12),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 4),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2),
    labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) 

#Persist.GO.Sig <- ggarrange(PersistGO.BP,PersistGO.MF, ncol=2, common.legend = TRUE, legend="bottom")
ggsave(file="RAnalysis/Figs/FigS5_BP_all.pdf", FigS5, width = 10, height = 16, units = c("in"))


#subset GO slims to reduce heatmap size and focus on those that are not in NA slim or greater than 1-2 terms

Fig7<-  byGO_long_term_gene %>% filter(ontology=="BP", TP!= "Persistent",GO.Slim.Term %in% c("cell organization and biogenesis","signal transduction", "transport", "developmental processes", "protein metabolism")) %>%
  ggplot(., aes(x = TP, y = order2)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(factor(GO.Slim.Term, levels = c("cell organization and biogenesis","signal transduction", "developmental processes", "protein metabolism", "transport")) ~ .,scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 12),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 4),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2),
    labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) 

ggsave(file="RAnalysis/Figs/Fig7_BP_subset.pdf", Fig7, width = 8, height = 10, units = c("in"))


FigS6<-  byGO_long_term_gene %>% filter(ontology=="MF",TP!= "Persistent") %>%
  ggplot(., aes(x = TP, y = order2)) + 
  geom_tile(aes(fill =over_represented_pvalue)) + 
  facet_grid(GO.Slim.Term ~ .,scales = "free_y", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
  strip.text.y = element_text(angle=0, size = 12),
  strip.text.x = element_text(size = 20),
  axis.text = element_text(size = 4),
  axis.title.x = element_blank(),
  axis.title.y = element_blank()) + scale_y_continuous(breaks = unique(byGO_long_term_gene$order2),
    labels = unique(byGO_long_term_gene$term_gene),expand = c(0,0)) 

ggsave(file="RAnalysis/Figs/FigS6_MF_all.pdf", FigS6, width = 8, height = 20, units = c("in"))


```