---
title: "Clustering technical replicates"
author: "Shelly Trigg"
date: "1/11/2019"
output: rmarkdown::github_document
---

Load packages
```{r}
library(vegan)
library(ggplot2)
library(dplyr)
library(gtools)
```

Load Abacus data, parse out ADJNSAF values, and simplify column names to just sample number
```{r}
#upload data file
ABACUSdata <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/ABACUS_output021417.tsv", sep = "\t", header=TRUE, stringsAsFactors = FALSE)
#select only columns containing ADJNSAF and Protein ID
ABACUSdata <- ABACUSdata[,c(1,grep("ADJNSAF", colnames(ABACUSdata)))]

## change column names in ABACUSdata to just sampleID
colnames(ABACUSdata) <- gsub(pattern = "X20161205_SAMPLE_", "", colnames(ABACUSdata))
colnames(ABACUSdata) <- gsub(pattern = "_ADJNSAF", "", colnames(ABACUSdata))
```

Load meta data file with temperature and day information
```{r}
#upload meta data; this was a csv file I create from Rhonda's notebook entry: https://github.com/Ellior2/Ellior2.github.io/blob/master/_posts/2017-3-11-NMDS-analysis.md
meta_data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/Rhonda_new_sample_names.csv", header = TRUE, stringsAsFactors = FALSE)
meta_data$silo <- substr(meta_data$Contents,5,5)
meta_data$day <- substr(meta_data$SampleName,5,6)
meta_data$SampleName <- gsub(pattern = "H","",meta_data$SampleName)
meta_data$SampleName <- gsub(pattern = "C","",meta_data$SampleName)
#create a temperature column
meta_data$temp <- "temp"
for(i in 1:nrow(meta_data)){
  if(meta_data$silo[i] == "2"){
    meta_data$temp[i] <- "23"
  }
  if(meta_data$silo[i] == "3"){
    meta_data$temp[i] <- "23"
  }
  if(meta_data$silo[i] == "9"){
    meta_data$temp[i] <- "29"
  }
  if(meta_data$silo[i] == "e"){
    meta_data$temp[i] <- "16"
  }
}
```

Reformat Abacus data for NMDS
```{r}
#Transpose- switch rows and columns
tABACUSdata <- t.data.frame(ABACUSdata[,-1])
colnames(tABACUSdata) <- ABACUSdata[,1]
tABACUSdata <- cbind(data.frame(rownames(tABACUSdata)),tABACUSdata)
colnames(tABACUSdata)[1] <- "SampleID"

#add meta data to abacus data
tABACUSdata <- merge(meta_data[,c(1,2,7,8)],tABACUSdata, by = "SampleID")

#Remove Silo 9 
silo3and2 <- tABACUSdata[which(substr(tABACUSdata$SampleName,1,2) != "S9"),]
#make rownames from Sample ID column so that the NMDS knows what's what
rownames(silo3and2) <- silo3and2$SampleID
#order the data frame by day and temperature so coloring the points on the plot is easier
silo3and2 <- silo3and2[order(as.numeric(silo3and2$day),silo3and2$temp),]
```

Determine if any proteins have zero ADJNSAF vals for all samples; this would be because they were in Silo 2, but not in Silo 3 or 9
```{r}
no_val_proteins <- silo3and2[,which(apply(silo3and2, 2, var) == 0)]
ncol(no_val_proteins)
```

Remove proteins if they have a zero value in all samples
```{r}
silo3and2_nozerovar <- silo3and2[,-c(1:4,which(colnames(silo3and2) %in% colnames(no_val_proteins)))]
#check to make sure it worked
ncol(silo3and2)-ncol(silo3and2_nozerovar)
```

For proteins with a zero value in any sample, replace with very small value
```{r}
silo3and2_nozerovar[silo3and2_nozerovar == 0.0000] <- 0.1000
```

```{r, echo = FALSE, eval=FALSE}
write.csv(cbind(silo3and2$SampleID, silo3and2$SampleName,silo3and2$day, silo3and2$temp,silo3and2_nozerovar), "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals.csv", row.names = FALSE, quote = FALSE)
```

try PCA

```{r}
#first convert day and temp data to factors for plotting
silo3and2$day <- factor(silo3and2$day, levels = unique(silo3and2$day))
silo3and2$temp <- factor(silo3and2$temp, levels = unique(silo3and2$temp))
silo3and2$silo <- factor(substr(silo3and2$SampleName,0,2))

#run PCA
pca <- prcomp(silo3and2_nozerovar, center = T, scale = T)
pca_meta <- data.frame(cbind(silo3and2[,c("day","temp","silo")],pca$x))
ggplot(pca_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo)) + theme_bw() + ggtitle("PCA of ADJNSAF values where zeros were replaced with 0.1")
```

try PCA on log transformed values
```{r}
silo3and2_log <- log(silo3and2_nozerovar,2)
pca_log <- prcomp(silo3and2_log, center = F, scale = F)
pca_log_meta <-  data.frame(cbind(silo3and2[,c("day","temp","silo")],pca$x))
colnames(pca_log_meta)[1:3] <- c("day","temp","silo")
ggplot(pca_log_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo)) + theme_bw() + ggtitle("PCA of log ADJNSAF values with zeros replaced with 0.1")
```

boxplots of common contaminant proteins; trypsin shows generally consistent levels as we'd expect
```{r}
boxplot(silo3and2_nozerovar[,-grep("CHOYP", colnames(silo3and2_nozerovar))])
```

Average NSAF values for all proteins
```{r}
df_all_avg <- data.frame()
#loop through the data and calculate the mean between replicates for each protein
for (i in seq(1,nrow(silo3and2_nozerovar),2)){
  #this calculates the mean for each odd number row and the row following it
  df_all_avg_row <- apply(silo3and2_nozerovar[c(i,i+1),],2,mean)
  #this sequencially combines rows of data together after the SD is generated
  df_all_avg <- rbind(df_all_avg, df_all_avg_row)
}
#add column names to SD data
colnames(df_all_avg) <- colnames(silo3and2_nozerovar)
rownames(df_all_avg) <- rownames(silo3and2_nozerovar[-grep("A",rownames(silo3and2_nozerovar)),])

```

export data set avg tech. rep. NSAF for all proteins 
```{r,echo=FALSE, eval=FALSE}
new_data <- cbind(silo3and2[-grep("A",rownames(silo3and2)),c("day","temp","silo")],df_all_avg)
new_data <- new_data[order(new_data$day, new_data$temp, new_data$silo),]
write.csv(new_data, "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals_AVGs.csv", quote = FALSE, row.names = FALSE)
```