# Load libraries library('SNFtool') library('RColorBrewer') library('reshape2') library('ggplot2') setwd("< FILL THIS WITH YOUR DIRECTORY >") ## Load data load('CBW-GenomicMedicine-Module7-data.RData') ## What's in your environment now? ls() ## You should have loaded ## Looking at dimensions of the data we're integrating: mRNA, methylation, and miRNA dim(mrna) dim(methyl) dim(mirna) ## Looking at the data we're integrating (looking at first 6 rows and first 6 columns) mrna[1:6,1:6] # Genes labelled numerically -- something you'll have to deal with eventually methyl[1:6,1:6] # Methylation levels averaged over gene mirna[1:6,1:6] # miRNA ## Check that each dataset has the same patients identical(names(mrna),names(methyl)) identical(names(mrna),names(mirna)) ## Put datasets into a list data_list <- list(mrna,methyl,mirna) names(data_list) <- c("mRNA","Methyl","miRNA") ##**************************************************## ## ## ## Similarity Network Fusion ## ## ## ##**************************************************## ## First, set all the parameters: K = 20; ##number of neighbors, must be greater than 1. usually (10~30) alpha = 0.5; ##hyperparameter, usually (0.3~0.8) T = 20; ###Number of Iterations, usually (10~50) ## Standard normalize each data type data_norm_list <- lapply(< DATA LIST OBJECT >,standardNormalization) ## Create distance matrices for each data type data_dist_list <- lapply(X = < NORMALIZED DATA LIST OBJECT >, function(x){dist2(t(x),t(x))}) ## Create affinity matrices for each data type data_aff_list <- lapply(X = < DATA DISTANCE LIST OBJECT >, function(x){affinityMatrix(x,K,alpha)}) ## Get clusters for each data type (we're doing this simply/quickly since it's just a check) # Estimate number of clusters (cluster_nums <- lapply(X = < DATA AFFINITY LIST OBJECT >, function(x){estimateNumberOfClustersGivenGraph(W = x,NUMC = 2:10)[[1]]})) # Cluster data clusters <- sapply(1:3, function(i){spectralClustering(affinity = data_aff_list[[i]],K = cluster_nums[[i]])}) colnames(clusters) <- c("mRNA","Methyl","miRNA") # Looking closer at clusters apply(clusters,2,table) ## Create heatmaps for each data type displayClustersWithHeatmap(W = data_aff_list[["mRNA"]], group = clusters[,"mRNA"], col = brewer.pal(name = "RdGy",n = 10)) displayClustersWithHeatmap(W = data_aff_list[["Methyl"]], group = clusters[,"Methyl"], col = brewer.pal(name = "RdGy",n = 10)) displayClustersWithHeatmap(W = data_aff_list[["miRNA"]], group = clusters[,"miRNA"], col = brewer.pal(name = "RdGy",n = 10)) ## Running SNF algorithm W = SNF(< DATA AFFINITY LIST >,K,T) # Add column and row names to SNF affinity matrix colnames(W) <- rownames(W) <- colnames(mirna) ## Choosing number of clusters using eigen-gaps and rotation cost algorithms estimateNumberOfClustersGivenGraph(W = < AFFINITY MATRIX OBJECT >,2:10) ## Both eigen-gaps [[1]] and rotation cost [[2]] ## indicate that 2 is the best number of clusters # Perform clustering on the fused network clustering2 = spectralClustering(< AFFINITY MATRIX OBJECT >,2) # Look at distribution of group membership # With table table(clustering2) # With barplot barplot(table(clustering2),col = c('darkorchid4','dodgerblue4')) # Create dataframe for cluster 2 cluster2.df <- data.frame(cbind(colnames(mrna),clustering2)) names(cluster2.df) <- c("id","cluster") # Heatmap of fused matrix displayClustersWithHeatmap(W = < AFFINITY MATRIX OBJECT >,group = < 2 CLUSTERING OBJECT >, col = brewer.pal(name = "Spectral",n = 10)) # Can check out how clustering of 3 and 5 clustering3 = spectralClustering(W,3) displayClustersWithHeatmap(W = W,group = clustering3,col = brewer.pal(name = "RdGy",n = 10)) clustering5 = spectralClustering(W,5) displayClustersWithHeatmap(W = W,group = clustering5,col = brewer.pal(name = "RdGy",n = 10)) ##**************************************************## ## ## ## ASSESSING CLINICAL ATTRIBUTES OF CLUSTERS ## ## ## ##**************************************************## ## ## LOOKING AT DISTRIBUTION OF Basal, LumA, and LumB subtypes ## head(brca.subtype.data) brca.subtype.data$basal <- "Not Basal" brca.subtype.data$basal[substr(x = brca.subtype.data$V2,start = 1, stop = 9) == "ER-/HER2-"] <- "Basal" brca.subtype.data$luma <- "Not LumA" brca.subtype.data$luma[substr(x = brca.subtype.data$V2,start = 11, stop = nchar(brca.subtype.data$V2)) == "Low Prolif"] <- "LumA" brca.subtype.data$lumb <- "Not LumB" brca.subtype.data$lumb[substr(x = brca.subtype.data$V2,start = 11, stop = nchar(brca.subtype.data$V2)) == "High Prolif"] <- "LumB" head(cluster2.df) cluster2.df$id2 <- substr(x = cluster2.df$id,start = 9,stop = 12) brca.subtype.cluster2.merge <- merge(brca.subtype.data, cluster2.df, by.x = 'V1', by.y = 'id2') head(brca.subtype.cluster2.merge) table(brca.subtype.cluster2.merge$basal, brca.subtype.cluster2.merge$cluster) table(brca.subtype.cluster2.merge$luma,brca.subtype.cluster2.merge$cluster) table(brca.subtype.cluster2.merge$lumb,brca.subtype.cluster2.merge$cluster) chisq.test(table(brca.subtype.cluster2.merge$basal, brca.subtype.cluster2.merge$cluster)) chisq.test(table(brca.subtype.cluster2.merge$luma,brca.subtype.cluster2.merge$cluster)) chisq.test(table(brca.subtype.cluster2.merge$lumb,brca.subtype.cluster2.merge$cluster)) ## Simple barplot barplot(table(brca.subtype.cluster2.merge$basal, brca.subtype.cluster2.merge$cluster)) ## Colorful barplots side-by-side par(mfrow = c(1,3)) # this plots 1 row and 3 columns of graphs (3 graphs side by side) barplot(table(brca.subtype.cluster2.merge$basal,brca.subtype.cluster2.merge$cluster), main = "Basal", xlab = 'cluster', col = c('tomato',rgb(red=255, green=99, blue=71, alpha=90, names = NULL, maxColorValue = 255))) barplot(table(brca.subtype.cluster2.merge$luma,brca.subtype.cluster2.merge$cluster), main = "LumA", xlab = 'cluster', col = c('darkturquoise',rgb(red=0, green=206, blue=209, alpha=60, names = NULL, maxColorValue = 255))) barplot(table(brca.subtype.cluster2.merge$lumb,brca.subtype.cluster2.merge$cluster), main = "LumB", xlab = 'cluster', col = c('seagreen4',rgb(red=46, green=139, blue=87, alpha=90, names = NULL, maxColorValue = 255))) ## cluster 2 appears to be primarily basal with some lumB par(mfrow = c(1,1)) ## change back to single frame graph ## ## BONUS: CREATE PIE CHART OF SUBTYPES IN CLUSTER 2 ## # First, create data frame of cluster 2 only cluster2only.df <- brca.subtype.cluster2.merge[brca.subtype.cluster2.merge$cluster == 2,] # Have a look at it head(cluster2only.df) # Create new variable indicating the subtype cluster2only.df$Subtype <- 'No subtype' cluster2only.df$Subtype[cluster2only.df$basal == "Basal"] <- "Basal" # Use the three subtype variables to name subtypes in a single variable cluster2only.df$Subtype[cluster2only.df$luma == "LumA"] <- "LumA" cluster2only.df$Subtype[cluster2only.df$lumb == "LumB"] <- "LumB" # Frquencies of each subtype in cluster 2 table(cluster2only.df$Subtype) # Relative frequencies table(cluster2only.df$Subtype)/sum(table(cluster2only.df$Subtype)) # Put frequencies in a pie chart pie(table(cluster2only.df$Subtype),labels = names(table(cluster2only.df$Subtype)),col = brewer.pal("Dark2",n = 4)) ## ## BONUS: LOOKING AT DISTRIBUTION OF OTHER CLINICAL VARIABLES ## # Merge our other clinical set with the cluster data frame clinical.clust.df <- merge(clinical.data,cluster2.df,by.x = 'sample_id',by.y = 'id') head(clinical.clust.df) # Look at counts for different histological types table(clinical.clust.df$histological_type,clinical.clust.df$cluster) # Test if age of diagnosis confers with cluster membership t.test(age_at_initial_pathologic_diagnosis ~ cluster,clinical.clust.df) # Plot age of onset for each cluster ggplot(clinical.clust.df,aes(age_at_initial_pathologic_diagnosis,fill = cluster)) + xlab('Age at Initial Pathological Diagnosis') + geom_density(alpha = 0.6) + scale_fill_manual( values = c('darkorchid4','dodgerblue4')) + theme_bw() t.test(tumor_nuclei_percent ~ cluster,clinical.clust.df) ggplot(clinical.clust.df,aes(tumor_nuclei_percent,fill = cluster)) + xlab('Tumor Nuclei Percentage') + geom_density(alpha = 0.6) + scale_fill_manual( values = c('darkorchid4','dodgerblue4')) + theme_bw() ## ## BONUS: DETERMINING FEATURES THAT DRIVE CLUSTERS (this can take a bit of time) ## # miRNA kw.mirna <- data.frame(matrix(nrow = nrow(mirna),ncol = 2)) names(kw.mirna) <- c('miRNA','kw_pval') for(k in 1:nrow(mirna)){ kw.mirna[k,'miRNA'] <- rownames(mirna)[k] kw.mirna[k,'kw_pval'] <- kruskal.test(unlist(mirna[k,]) ~ clustering2)$p.value } # Methylation kw.methyl <- data.frame(matrix(nrow = nrow(methyl),ncol = 2)) names(kw.methyl) <- c('probe','kw_pval') for(k in 1:nrow(methyl)){ kw.methyl[k,'probe'] <- rownames(methyl)[k] kw.methyl[k,'kw_pval'] <- kruskal.test(unlist(methyl[k,]) ~ clustering2)$p.value } # mRNA kw.mrna <- data.frame(matrix(nrow = nrow(mrna),ncol = 2)) names(kw.mrna) <- c('gene','kw_pval') for(k in 1:nrow(mrna)){ kw.mrna[k,'gene'] <- rownames(mrna)[k] kw.mrna[k,'kw_pval'] <- kruskal.test(unlist(mrna[k,]) ~ clustering2)$p.value } # Put kruskal wallis data frames into a list kw.list <- list(kw.mrna,kw.methyl,kw.mirna) names(kw.list) <- c('mRNA','Methyl','miRNA') # Sort kw.list head(kw.methyl) kw.list.sorted <- lapply(kw.list,function(x)x[order(x = x[,'kw_pval']),]) ## Look at top associated genes and miRNA for each of the data sets lapply(kw.list.sorted,head) ## Note: numeric values for mRNA gene names -- important to label rows and columns in your data! ## ## BONUS: Create cytoscape file ## W_cyto <- W # Set lower triangle and diagonal to missing since matrix is symmetric W_cyto[lower.tri(W_cyto, diag = F)] <- NA W_cyto[1:6,1:6] ## look at first six rows and columns, lower triangle and diagonal are now 'NA' # Make matrix into a dataframe W_cyto_mlt <- melt(W_cyto) # Check it out -- lots of missing values head(W_cyto_mlt) # Let's overwrite the object with the missing values removed W_cyto_mlt <- W_cyto_mlt[!(is.na(W_cyto_mlt$value)),] # That's better! head(W_cyto_mlt) # Write a text file with the affinities for each of your patients relative to each other write.table(W_cyto_mlt,file = "SNF-affinity-matrix-cytoscape-infile.txt",quote=FALSE,row.names=FALSE,col.names=TRUE)