--- title: "Clustering technical replicates" author: "Shelly Trigg" date: "1/11/2019" output: rmarkdown::github_document --- Load packages ```{r} library(vegan) library(ggplot2) library(dplyr) library(gtools) ``` Load Abacus data, parse out ADJNSAF values, and simplify column names to just sample number ```{r} #upload data file ABACUSdata <- read.csv("~/Documents/GitHub/OysterSeedProject/raw_data/ABACUS_output021417.tsv", sep = "\t", header=TRUE, stringsAsFactors = FALSE) #select only columns containing ADJNSAF and Protein ID ABACUSdata <- ABACUSdata[,c(1,grep("ADJNSAF", colnames(ABACUSdata)))] ## change column names in ABACUSdata to just sampleID colnames(ABACUSdata) <- gsub(pattern = "X20161205_SAMPLE_", "", colnames(ABACUSdata)) colnames(ABACUSdata) <- gsub(pattern = "_ADJNSAF", "", colnames(ABACUSdata)) ``` Load meta data file with temperature and day information ```{r} #upload meta data; this was a csv file I create from Rhonda's notebook entry: https://github.com/Ellior2/Ellior2.github.io/blob/master/_posts/2017-3-11-NMDS-analysis.md meta_data <- read.csv("~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/Rhonda_new_sample_names.csv", header = TRUE, stringsAsFactors = FALSE) meta_data$silo <- substr(meta_data$Contents,5,5) meta_data$day <- substr(meta_data$SampleName,5,6) meta_data$SampleName <- gsub(pattern = "H","",meta_data$SampleName) meta_data$SampleName <- gsub(pattern = "C","",meta_data$SampleName) #create a temperature column meta_data$temp <- "temp" for(i in 1:nrow(meta_data)){ if(meta_data$silo[i] == "2"){ meta_data$temp[i] <- "23" } if(meta_data$silo[i] == "3"){ meta_data$temp[i] <- "23" } if(meta_data$silo[i] == "9"){ meta_data$temp[i] <- "29" } if(meta_data$silo[i] == "e"){ meta_data$temp[i] <- "16" } } ``` Reformat Abacus data for NMDS ```{r} #Transpose- switch rows and columns tABACUSdata <- t.data.frame(ABACUSdata[,-1]) colnames(tABACUSdata) <- ABACUSdata[,1] tABACUSdata <- cbind(data.frame(rownames(tABACUSdata)),tABACUSdata) colnames(tABACUSdata)[1] <- "SampleID" #add meta data to abacus data tABACUSdata <- merge(meta_data[,c(1,2,7,8)],tABACUSdata, by = "SampleID") #Remove Silo 9 silo3and2 <- tABACUSdata[which(substr(tABACUSdata$SampleName,1,2) != "S9"),] #make rownames from Sample ID column so that the NMDS knows what's what rownames(silo3and2) <- silo3and2$SampleID #order the data frame by day and temperature so coloring the points on the plot is easier silo3and2 <- silo3and2[order(as.numeric(silo3and2$day),silo3and2$temp),] ``` Determine if any proteins have zero ADJNSAF vals for all samples; this would be because they were in Silo 2, but not in Silo 3 or 9 ```{r} no_val_proteins <- silo3and2[,which(apply(silo3and2, 2, var) == 0)] ncol(no_val_proteins) ``` Remove proteins if they have a zero value in all samples ```{r} silo3and2_nozerovar <- silo3and2[,-c(1:4,which(colnames(silo3and2) %in% colnames(no_val_proteins)))] #check to make sure it worked ncol(silo3and2)-ncol(silo3and2_nozerovar) ``` For proteins with a zero value in any sample, replace with very small value ```{r} silo3and2_nozerovar[silo3and2_nozerovar == 0.0000] <- 0.1000 ``` ```{r, echo = FALSE, eval=FALSE} write.csv(cbind(silo3and2$SampleID, silo3and2$SampleName,silo3and2$day, silo3and2$temp,silo3and2_nozerovar), "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals.csv", row.names = FALSE, quote = FALSE) ``` try PCA ```{r} #first convert day and temp data to factors for plotting silo3and2$day <- factor(silo3and2$day, levels = unique(silo3and2$day)) silo3and2$temp <- factor(silo3and2$temp, levels = unique(silo3and2$temp)) silo3and2$silo <- factor(substr(silo3and2$SampleName,0,2)) #run PCA pca <- prcomp(silo3and2_nozerovar, center = T, scale = T) pca_meta <- data.frame(cbind(silo3and2[,c("day","temp","silo")],pca$x)) ggplot(pca_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo)) + theme_bw() + ggtitle("PCA of ADJNSAF values where zeros were replaced with 0.1") ``` try PCA on log transformed values ```{r} silo3and2_log <- log(silo3and2_nozerovar,2) pca_log <- prcomp(silo3and2_log, center = F, scale = F) pca_log_meta <- data.frame(cbind(silo3and2[,c("day","temp","silo")],pca$x)) colnames(pca_log_meta)[1:3] <- c("day","temp","silo") ggplot(pca_log_meta, aes(PC1, PC2)) + geom_point(aes(col = day, shape = silo)) + theme_bw() + ggtitle("PCA of log ADJNSAF values with zeros replaced with 0.1") ``` boxplots of common contaminant proteins; trypsin shows generally consistent levels as we'd expect ```{r} boxplot(silo3and2_nozerovar[,-grep("CHOYP", colnames(silo3and2_nozerovar))]) ``` Average NSAF values for all proteins ```{r} df_all_avg <- data.frame() #loop through the data and calculate the mean between replicates for each protein for (i in seq(1,nrow(silo3and2_nozerovar),2)){ #this calculates the mean for each odd number row and the row following it df_all_avg_row <- apply(silo3and2_nozerovar[c(i,i+1),],2,mean) #this sequencially combines rows of data together after the SD is generated df_all_avg <- rbind(df_all_avg, df_all_avg_row) } #add column names to SD data colnames(df_all_avg) <- colnames(silo3and2_nozerovar) rownames(df_all_avg) <- rownames(silo3and2_nozerovar[-grep("A",rownames(silo3and2_nozerovar)),]) ``` export data set avg tech. rep. NSAF for all proteins ```{r,echo=FALSE, eval=FALSE} new_data <- cbind(silo3and2[-grep("A",rownames(silo3and2)),c("day","temp","silo")],df_all_avg) new_data <- new_data[order(new_data$day, new_data$temp, new_data$silo),] write.csv(new_data, "~/Documents/GitHub/OysterSeedProject/analysis/nmds_R/silo3and2_nozerovals_AVGs.csv", quote = FALSE, row.names = FALSE) ```