--- title: "ASCA on average NSAF values of all proteins" author: "Shelly Trigg" date: "1/17/2019" output: rmarkdown::github_document --- load libraries ```{r, echo = FALSE} library(dplyr) library(tidyr) library(MetStaT) library(ggplot2) library(heatmap3) ``` load data ```{r} #NSAF data from filtered proteins data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals_AVGs.csv", stringsAsFactors = FALSE) ``` ```{r, echo =FALSE} #load data with annotations so protein IDs can be substituted with shorter names #data with uniprot annotations data_w_uniprot <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/gigaton-uniport-table_GOterms-sperated.csv", stringsAsFactors = FALSE) #subset only Protein IDs and Entry names data_w_uniprot <- data_w_uniprot[,c("Protein.ID", "Entry.name")] #replace pipe character with nothing; remember \\ to escape the | character data_w_uniprot$Protein.ID <- gsub("\\|",".", data_w_uniprot$Protein.ID) #replace all characters following '_' with nothing in the entry name column data_w_uniprot$Entry.name.simple <- gsub("\\_.*","", data_w_uniprot$Entry.name) #replace 'none' with 'unknown' data_w_uniprot$Entry.name.simple <- gsub("None","Unknown", data_w_uniprot$Entry.name.simple) #there are duplicated Entry names for unique protein IDs, so we need to make unique Entry names by adding .1, .2 etc. #https://stackoverflow.com/questions/16646446/renaming-duplicate-strings-in-r data_w_uniprot$Entry.name.new <- make.names(data_w_uniprot$Entry.name.simple, unique = TRUE) ``` ```{r subset_data_for_CHOYP_only, echo = FALSE} #make a dataframe with just CHOYP proteins choyp_data <- data[,c(1:3,grep("CHOYP", colnames(data)))] #make a seperate data frame with contaminant proteins contam_data <- data[,-grep("CHOYP", colnames(data))] ``` **Perform ASCA** ```{r} #create matrix to pass to ASCA command, excluding the silo and time info ASCA_X <- as.matrix(choyp_data[,-c(1:3)]) #create matrix to pass to ASCA command with only the silo and time info ASCA_F <- choyp_data[,c(1,3)] ASCA_F$silo <- as.numeric(as.factor(ASCA_F$silo)) ASCA_F <- as.matrix(ASCA_F) #perform ASCA ASCA <- ASCA.Calculate(ASCA_X, ASCA_F, equation.elements = "1,2,12", scaling = FALSE) ``` Here is a summary of the ASCA results (e.g. variance explained by different factors; factor 1= time (days), factor 2 = silo, interaction = interaction of time and silo) ```{r} #print the ASCA summary ASCA.GetSummary(ASCA) ``` ### Plot PCAs from ASCA **This first plot is the time (days) effect PCA** ```{r avgNSAF_PCA_timeEffect_plot} #plot PCA for factor 1, which is time in this case ASCA.PlotScoresPerLevel(ASCA, ee = "1", pcs = "1,2") ``` **This next plot is the silo effect PCA** ```{r avgNSAF_PCA_siloEffect_plot} #plot PCA for factor 2, which is silo in this case ASCA.PlotScoresPerLevel(ASCA, ee = "2", pcs = "1,2") ``` **This next plot is the time x silo interaction effect PCA** ```{r avgNSAF_PCA_timeXsiloEffect_plot} #plot PCA for factor interaction, which is time x silo in this case timexsilo_PC12 <- data.frame(ASCA$`12`$svd$t[,c(1,2)]) timexsilo_PC12 <- cbind(data.frame(ASCA$`12`$level.combinations$row.patterns), timexsilo_PC12) colnames(timexsilo_PC12)<- c("day","silo","PC1","PC2") timexsilo_PC12$day <- as.character(timexsilo_PC12$day) timexsilo_PC12$silo <- as.character(timexsilo_PC12$silo) ggplot(timexsilo_PC12, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo, size = 3)) + theme_bw() + ggtitle("PC1 vs PC2 for time x silo interaction effect") + theme(plot.title = element_text(face = "bold")) + xlab(paste("PC1"," (",formatC(ASCA$`12`$svd$var.explained[1] * 100,digits=2,format="f"),"%)", sep = "")) + ylab(paste("PC2"," (",formatC(ASCA$`12`$svd$var.explained[2] * 100,digits=2,format="f"),"%)", sep = "")) ``` ### Analysis of proteins affected by silo Because the silo effect PCA show the most separation between 23C and 29C in PC2, we will look at those loadings. ```{r avgNSAF_PCA_siloEffect_PC12loadings, echo = FALSE} #extract protein names from ASCA data; these will be combined with loadings values; the order is maintained protnames <- data.frame(colnames(ASCA$data)) ``` combine protein names with ASCA loadings for PC1 and PC2 for silo ```{r avgNSAF_PCA_timeEffect_PC1loadings, echo = FALSE} #make dataframes of factor 1 (time) PC1 loadings d1 <- cbind(protnames, ASCA$`2`$svd$v[,1]) colnames(d1) <- c("protein", "PC1loadings") d1_great <- d1[which(d1$PC1loadings > 0),] d1_less <- d1[which(d1$PC1loadings < 0),] plot(d1_great[order(d1_great$PC1loadings, decreasing = TRUE),2]) plot(d1_less[order(d1_less$PC1loadings, decreasing = TRUE),2]) ``` **PC2 loadings for the time effect PCA** ```{r avgNSAF_PCA_siloEffect_PC2loadings, echo = FALSE} d2 <- cbind(protnames, ASCA$`2`$svd$v[,2]) colnames(d2) <- c("protein", "PC2loadings") d2_great <- d2[which(d2$PC2loadings > 0),] d2_less <- d2[which(d2$PC2loadings < 0),] plot(d2_great[order(d2_great$PC2loadings, decreasing = TRUE),2]) plot(d2_less[order(d2_less$PC2loadings, decreasing = TRUE),2]) #make list of proteins with silo PC1 loadings values >= 0.025 cutd1 <- data.frame(d1[which(abs(d1$PC1loadings) >= 0.025),1]) colnames(cutd1) <- "protein" cutd2 <- data.frame(d2[which(abs(d2$PC2loadings) >= 0.025),1]) colnames(cutd2) <- "protein" cutd12 <- unique(rbind(cutd1,cutd2)) ``` To pull out proteins affected by silo based on their influence in seperating treatment groups on the silo PCA, I picked an absolute value loadings threshold of 0.025. This means any protein that had a loadings value > 0.025 or < -0.025 was selected. **Heatmap of proteins affected by silo, time points are side-by-side** ```{r avgNSAF_siloEffect_cutoff0.025_heatmap_OrderedByTime_ShortNames, echo = FALSE, fig.height=15, fig.width=15} #make a list of cutoff proteins with normalized abundance cut_data <-data[,which(colnames(data) %in% cutd12$protein)] rownames(cut_data) <- paste(data$day, data$silo, sep="_") heatmap3(t.data.frame(cut_data),Colv = NA,cexRow = 0.3) ``` ```{r, echo = FALSE, eval = FALSE} # write.csv(cut_data, "~/Documents/GitHub/OysterSeedProject/analysis/Silo2vs3/ASCA_SiloAffectedProteins.csv", quote = FALSE, row.names = FALSE) ``` Number of proteins affected by silo at loadings value > 0.025 or < -0.025 ```{r, echo = FALSE} nrow(cutd12) ``` **Heatmap of proteins affected by silo, ordered by silo and then by time** ```{r avgNSAF_siloEffect_cutoff0.025_heatmap_OrderedBysilo_ShortNames,fig.height=15, fig.width=15, echo = FALSE} #reorder the cut data so that samples are grouped by silo cut_data_ord <- data[,c(1:3,which(colnames(data) %in% cutd12$protein))] cut_data_ord <- cut_data_ord[order(cut_data_ord$silo,cut_data_ord$day),] rownames(cut_data_ord) <- paste(cut_data_ord$day, cut_data_ord$silo, sep="_") cut_data_ord <- cut_data_ord[,-c(1:3)] #replace CHOYP names with "entry names" #transposed to eventually get row names as columns to match entry names to. #t.data.frame returns a matrix so I had convert back to data frame for all downstream rearranging because data frames are much easier to work with cut_data_ord_t <- data.frame(t.data.frame(cut_data_ord)) #make row names which contain the protein IDs a column cut_data_ord_t <- cbind(rownames(cut_data_ord_t), cut_data_ord_t) #add the column name colnames(cut_data_ord_t)[1] <- "Protein.ID" #convert the Protein ID column to character cut_data_ord_t[,1] <- as.character(cut_data_ord_t[,1]) #add the Entry names cut_data_ord_t <- merge(data_w_uniprot[,c("Protein.ID","Entry.name.new")],cut_data_ord_t, by = "Protein.ID") #make the entry names the row names rownames(cut_data_ord_t) <- cut_data_ord_t[,"Entry.name.new"] #remove the protein ID and entry name columns since the row names are added cut_data_ord_t <- cut_data_ord_t[,-c(1:2)] #this step removes the x from the column names while preserving the order of colnames colnames(cut_data_ord_t) <- rownames(cut_data_ord) #plot the heat map heatmap3(as.matrix(cut_data_ord_t), Colv = NA, cexRow = 0.3) ```