---
title: "ASCA on average NSAF values of all proteins"
author: "Shelly Trigg"
date: "1/17/2019"
output: rmarkdown::github_document
---

load libraries
```{r, echo = FALSE}
library(dplyr)
library(tidyr)
library(MetStaT)
library(ggplot2)
library(heatmap3)
```

load data
```{r}
#NSAF data from filtered proteins
data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals_AVGs.csv", stringsAsFactors = FALSE)
```

```{r, echo =FALSE}
#load data with annotations so protein IDs can be substituted with shorter names
#data with uniprot annotations
data_w_uniprot <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/gigaton-uniport-table_GOterms-sperated.csv", stringsAsFactors = FALSE)
#subset only Protein IDs and Entry names
data_w_uniprot <- data_w_uniprot[,c("Protein.ID", "Entry.name")]
#replace pipe character with nothing; remember \\ to escape the | character
data_w_uniprot$Protein.ID <- gsub("\\|",".", data_w_uniprot$Protein.ID)
#replace all characters following '_' with nothing in the entry name column
data_w_uniprot$Entry.name.simple <- gsub("\\_.*","", data_w_uniprot$Entry.name)
#replace 'none' with 'unknown'
data_w_uniprot$Entry.name.simple <- gsub("None","Unknown", data_w_uniprot$Entry.name.simple)

#there are duplicated Entry names for unique protein IDs, so we need to make unique Entry names by adding .1, .2 etc.
#https://stackoverflow.com/questions/16646446/renaming-duplicate-strings-in-r

data_w_uniprot$Entry.name.new <- make.names(data_w_uniprot$Entry.name.simple, unique = TRUE)
```

```{r subset_data_for_CHOYP_only, echo = FALSE}
#make a dataframe with just CHOYP proteins
choyp_data <- data[,c(1:3,grep("CHOYP", colnames(data)))]

#make a seperate data frame with contaminant proteins
contam_data <- data[,-grep("CHOYP", colnames(data))]
```

**Perform ASCA**
```{r}
#create matrix to pass to ASCA command, excluding the silo and time info
ASCA_X <- as.matrix(choyp_data[,-c(1:3)])
#create matrix to pass to ASCA command with only the silo and time info
ASCA_F <- choyp_data[,c(1,3)]
ASCA_F$silo <- as.numeric(as.factor(ASCA_F$silo))
ASCA_F <- as.matrix(ASCA_F)
#perform ASCA
ASCA <- ASCA.Calculate(ASCA_X, ASCA_F, equation.elements = "1,2,12", scaling = FALSE)
```

Here is a summary of the ASCA results (e.g. variance explained by different factors; factor 1= time (days), factor 2 = silo, interaction = interaction of time and silo)
```{r}
#print the ASCA summary
ASCA.GetSummary(ASCA)
```

### Plot PCAs from ASCA  

**This first plot is the time (days) effect PCA**
```{r avgNSAF_PCA_timeEffect_plot}
#plot PCA for factor 1, which is time in this case
ASCA.PlotScoresPerLevel(ASCA, ee = "1", pcs = "1,2")
```

**This next plot is the silo effect PCA**
```{r avgNSAF_PCA_siloEffect_plot}
#plot PCA for factor 2, which is silo in this case
ASCA.PlotScoresPerLevel(ASCA, ee = "2", pcs = "1,2")
```

**This next plot is the time x silo interaction effect PCA**
```{r avgNSAF_PCA_timeXsiloEffect_plot}
#plot PCA for factor interaction, which is time x silo in this case
timexsilo_PC12 <- data.frame(ASCA$`12`$svd$t[,c(1,2)])
timexsilo_PC12 <- cbind(data.frame(ASCA$`12`$level.combinations$row.patterns), timexsilo_PC12)
colnames(timexsilo_PC12)<- c("day","silo","PC1","PC2")
timexsilo_PC12$day <- as.character(timexsilo_PC12$day)
timexsilo_PC12$silo <- as.character(timexsilo_PC12$silo)
ggplot(timexsilo_PC12, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo, size = 3)) + theme_bw() + ggtitle("PC1 vs PC2 for time x silo interaction effect") + theme(plot.title = element_text(face = "bold")) + xlab(paste("PC1"," (",formatC(ASCA$`12`$svd$var.explained[1] * 100,digits=2,format="f"),"%)", sep = "")) + ylab(paste("PC2"," (",formatC(ASCA$`12`$svd$var.explained[2] * 100,digits=2,format="f"),"%)", sep = ""))
```

### Analysis of proteins affected by silo

Because the silo effect PCA show the most separation between 23C and 29C in PC2, we will look at those loadings.

```{r avgNSAF_PCA_siloEffect_PC12loadings, echo = FALSE}
#extract protein names from ASCA data; these will be combined with loadings values; the order is maintained
protnames <- data.frame(colnames(ASCA$data))
```

combine protein names with ASCA loadings for PC1 and PC2 for silo

```{r avgNSAF_PCA_timeEffect_PC1loadings, echo = FALSE}
#make dataframes of factor 1 (time) PC1 loadings
d1 <- cbind(protnames, ASCA$`2`$svd$v[,1])
colnames(d1) <- c("protein", "PC1loadings")
d1_great <- d1[which(d1$PC1loadings > 0),]
d1_less <- d1[which(d1$PC1loadings < 0),]

plot(d1_great[order(d1_great$PC1loadings, decreasing = TRUE),2])
plot(d1_less[order(d1_less$PC1loadings, decreasing = TRUE),2])
```

**PC2 loadings for the time effect PCA**
```{r avgNSAF_PCA_siloEffect_PC2loadings, echo = FALSE}
d2 <- cbind(protnames, ASCA$`2`$svd$v[,2])
colnames(d2) <- c("protein", "PC2loadings")
d2_great <- d2[which(d2$PC2loadings > 0),]
d2_less <- d2[which(d2$PC2loadings < 0),]

plot(d2_great[order(d2_great$PC2loadings, decreasing = TRUE),2])
plot(d2_less[order(d2_less$PC2loadings, decreasing = TRUE),2])

#make list of proteins with silo PC1 loadings values >= 0.025
cutd1 <- data.frame(d1[which(abs(d1$PC1loadings) >= 0.025),1])
colnames(cutd1) <- "protein"
cutd2 <- data.frame(d2[which(abs(d2$PC2loadings) >= 0.025),1])
colnames(cutd2) <- "protein"
cutd12 <- unique(rbind(cutd1,cutd2))
```

To pull out proteins affected by silo based on their influence in seperating treatment groups on the silo PCA, I picked an absolute value loadings threshold of 0.025. This means any protein that had a loadings value > 0.025 or < -0.025 was selected.

**Heatmap of proteins affected by silo, time points are side-by-side**
```{r avgNSAF_siloEffect_cutoff0.025_heatmap_OrderedByTime_ShortNames, echo = FALSE, fig.height=15, fig.width=15}

#make a list of cutoff proteins with normalized abundance
cut_data <-data[,which(colnames(data) %in% cutd12$protein)]
rownames(cut_data) <- paste(data$day, data$silo, sep="_")
heatmap3(t.data.frame(cut_data),Colv = NA,cexRow = 0.3)
```

```{r, echo = FALSE, eval = FALSE}
#
write.csv(cut_data, "~/Documents/GitHub/OysterSeedProject/analysis/Silo2vs3/ASCA_SiloAffectedProteins.csv", quote = FALSE, row.names = FALSE)
```

Number of proteins affected by silo at loadings value > 0.025 or < -0.025
```{r, echo = FALSE}
nrow(cutd12)
```

**Heatmap of proteins affected by silo, ordered by silo and then by time**
```{r avgNSAF_siloEffect_cutoff0.025_heatmap_OrderedBysilo_ShortNames,fig.height=15, fig.width=15, echo = FALSE}
#reorder the cut data so that samples are grouped by silo
cut_data_ord <- data[,c(1:3,which(colnames(data) %in% cutd12$protein))]
cut_data_ord <- cut_data_ord[order(cut_data_ord$silo,cut_data_ord$day),]
rownames(cut_data_ord) <- paste(cut_data_ord$day, cut_data_ord$silo, sep="_")
cut_data_ord <- cut_data_ord[,-c(1:3)]
#replace CHOYP names with "entry names"
#transposed to eventually get row names as columns to match entry names to. 
#t.data.frame returns a matrix so I had convert back to data frame for all downstream rearranging because data frames are much easier to work with
cut_data_ord_t <- data.frame(t.data.frame(cut_data_ord))
#make row names which contain the protein IDs a column
cut_data_ord_t <- cbind(rownames(cut_data_ord_t), cut_data_ord_t)
#add the column name
colnames(cut_data_ord_t)[1] <- "Protein.ID"
#convert the Protein ID column to character
cut_data_ord_t[,1] <- as.character(cut_data_ord_t[,1])
#add the Entry names
cut_data_ord_t <- merge(data_w_uniprot[,c("Protein.ID","Entry.name.new")],cut_data_ord_t, by = "Protein.ID")
#make the entry names the row names
rownames(cut_data_ord_t) <- cut_data_ord_t[,"Entry.name.new"]
#remove the protein ID and entry name columns since the row names are added
cut_data_ord_t <- cut_data_ord_t[,-c(1:2)]
#this step removes the x from the column names while preserving the order of colnames
colnames(cut_data_ord_t) <- rownames(cut_data_ord)
#plot the heat map
heatmap3(as.matrix(cut_data_ord_t), Colv = NA, cexRow = 0.3)
```