--- title: "Synapse Clustering: Random Half" author: "JLP" date: '`r Sys.Date()`' output: html_document: fig_caption: yes fig_height: 5 fig_width: 5 fig_retina: 2 highlight: pygments keep_md: yes number_sections: yes theme: cerulean toc: yes toc_depth: 3 pdf_document: fig_caption: yes keep_tex: yes number_sections: yes --- ```{r render, eval=FALSE, echo=FALSE} require(rmarkdown) rm(list=ls()); ### SAVE DOCUMENT!!! rmarkdown::render("./randomHalf.Rmd"); system('open randomHalf.html') ``` ```{r setup,include=FALSE,results='asis'} suppressMessages(require(Matrix)) suppressMessages(require(colorRamps)) suppressMessages(require(corrplot)) suppressMessages(require(gplots)) suppressMessages(require(dplyr)) suppressMessages(require(reshape2)) suppressMessages(require(ggplot2)) suppressMessages(require(parallel)) suppressMessages(require(data.table)) suppressMessages(require(fpc)) suppressMessages(require(repr)) suppressMessages(require(doMC)) suppressMessages(require(mclust)) suppressMessages(require(rgl)) suppressMessages(require(rglwidget)) source("./bic.r") source("./clusterFraction.r") source("http://www.cis.jhu.edu/~parky/Synapse/getElbows.R") registerDoMC(6) library(rmarkdown) library(knitr) knitr::opts_chunk$set(cache=TRUE, autodep=TRUE) dep_auto() # figure out dependencies automatically opts_chunk$set(cache=FALSE,echo=TRUE,warning=FALSE,message=FALSE,comment="#",fig.path='../Figures/randomHalf_figure/', dpi=100,dev=c('png','pdf')) opts_knit$set(aliases=c(h='fig.height', w='fig.width', cap='fig.cap', scap='fig.scap')) opts_knit$set(eval.after = c('fig.cap','fig.scap')) knit_hooks$set(document = function(x) { gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x) }) #opts_knit$set(animation.fun = hook_scianimator) knit_hooks$set(webgl=hook_webgl) knit_hooks$set(plot = function(x, options) { paste('

', sep = '') }) fn = local({ i = 0 function(x) { i <<- i + 1 paste('Figure ', i, ': ', x, sep = '') } }) fig <- local({ i <- 0 ref <- list() list( cap=function(refName, text) { i <<- i + 1 ref[[refName]] <<- i paste("Figure ", i, ": ", text, "

", sep="") }, ref=function(refName) { ref[[refName]] }) }) ``` The formatted source code for this file is [here](https://github.com/neurodata/synaptome-stats/blob/gh-pages/Code/randomHalf.Rmd). And a [raw version here](https://raw.githubusercontent.com/neurodata/synaptome-stats/gh-pages/Code/randomHalf.Rmd). Previous work by Youngser Park can be found [here](http://www.cis.jhu.edu/~parky/Synapse/synapse.html). # Introduction # Data > The corresponds to 24 channels x 6 features per synapse, ordered like > c0f0,c0f1,c0f2,c0f3,c0f4,c0f5,c1f0,c1f1... etc > >f0 = integrated brightness >f1 = local brightness >f2 = distance to Center of Mass >f3 = moment of inertia around synapsin maxima >f4,f5 are features that I forget what they are.. would need to ask brad. >i would throw them out, I did so in my kohonen code (which you have, its in matlab). and > On Feb 8, 2016, at 2:00 PM, Kristina Micheva wrote: * _Excitatory presynaptic: 'Synap', 'Synap', 'VGlut1', 'VGlut1', 'VGlut2'_, * _Excitatory postsynaptic: 'psd', 'glur2', 'nmdar1', 'nr2b', 'NOS', 'Synapo'_ (but further away than PSD, gluR2, nmdar1 and nr2b) * _Inhibitory presynaptic: 'gad', 'VGAT', 'PV'_, * _Inhibitory postsynaptic: 'Gephyr', 'GABAR1', 'GABABR', 'NOS'_, * _At a very small number of inhibitory: 'Vglut3' (presynaptic), 'CR1'(presynaptic)_, * _Other synapses:'5HT1A', 'TH', 'VACht'_, * _Not at synapses: 'tubuli', 'DAPI'_. and > On March 10, 2016, 00:29:04 (UTC), Kristina Micheva wrote: > There are 2 different Synap channels (2 different antibodies were > used), so that part is fine. > And 2 different VGluT1 channels (same antibody but done at different > times) > The NOS channel is the same, so count it as one even though it appears > twice. It is listed two times because it can be found at both excitatory > and inhibitory synapses. This is where your count of 25 comes, even > though there are 24 channels. > I would also add the 2 Synap channels to the Inhibitory presynaptic > category - there is supposed to be synapsin there, but at lower levels > compared to excitatory presynaptic category. - Note: The order of the channels are given by line `227` in the `kohenen.m` file which can be found in the dropbox folder. - `Synap` and `Synap` have been augmented to `Synap_1` and `Synap_2` for clarity. - `VGlut1` and `VGlut1` have been augmented to `VGlut1_t1` and `VGlut1_t2` to distinguish between the different times (which are unknown). ```{r cc_data, eval=TRUE} feat <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE) dim(feat) ### Setting a seed and creating an index vector ### to select half of the data set.seed(2^13) half1 <- sample(dim(feat)[1],dim(feat)[1]/2) feat <- feat[half1,] channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3', 'psd','glur2','nmdar1','nr2b','gad','VGAT', 'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A', 'NOS','TH','VACht','Synapo','tubuli','DAPI') ## NOS as ex.post #channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small', # 'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre', # 'in.pre','in.post','in.post','in.post','in.pre.small','other', # 'ex.post','other','other','ex.post','none','none') # ## NOS as in.post channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small', 'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre', 'in.pre','in.post','in.post','in.post','in.pre.small','other', 'in.post','other','other','ex.post','none','none') nchannel <- length(channel) nfeat <- ncol(feat) / nchannel fchannel <- as.numeric(factor(channel.type, levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none") )) ford <- order(fchannel) #ccol <- rainbow(max(fchannel))[fchannel] Syncol <- c("#197300","#5ed155","#990000","#cc0000","#ff9933","mediumblue","gold") ccol <- Syncol[fchannel] exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE) exCol<-exType;levels(exCol) <- c("#197300","#990000","mediumblue"); exCol <- as.character(exCol) fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5)))) names(feat) <- fname fcol <- rep(ccol, each=6) mycol <- colorpanel(100, "purple", "black", "green") mycol2 <- matlab.like(nchannel) ``` ## Transformations Each of the channels has an arbitrary independent linear transformation and thus converting to z-scores or quantiles is required. Selecting only the `f0` features, we will use both the feature vector and a transformed version, transforming by adding 1 and taking the log base 10 for each column and scaling. ```{r cc_featF0, eval=TRUE} f0 <- seq(1,ncol(feat),by=nfeat) fRaw <- subset(feat, select=f0) #fLog <- apply(X=(fRaw+1),2,log10) #fLogScaled <- scale(flog, center=TRUE, scale=TRUE) eps <- 2*.Machine$double.eps f01 <- apply(X=fRaw, 2, function(x){((x-min(x))/(max(x)-min(x)))}) fRank <- apply(fRaw,2,rank,ties.method='average') fLog <- apply(f01,2,function(x){ log10(x+eps)}) ``` # Correlations ## Raw ```{r cc_corRaw, eval=TRUE, fig.cap=fig$cap("corRaw", "Correlation on raw data, reordered by synapse type.")} tmp <- as.numeric(table(fchannel)) cmatRaw <- cor(fRaw) corrplot(cmatRaw[ford,ford],method="color",tl.col=ccol[ford]) corrRect(tmp,col=Syncol,lwd=4) ``` ## Log ```{r cc_corLog, eval=TRUE, fig.cap=fig$cap("corLog", "Correlation on log data, reordered by synapse type.")} cmatLog <- cor(fLog) corrplot(cmatLog[ford,ford],method="color",tl.col=ccol[ford]) corrRect(tmp,col=Syncol,lwd=4) ``` ## Rank ```{r cc_corRank, eval=TRUE, fig.cap=fig$cap("corRank", "Correlation on rank data, reordered by synapse type.")} cmatRank <- cor(fRank) corrplot(cmatRank[ford,ford],method="color",tl.col=ccol[ford]) corrRect(tmp,col=Syncol,lwd=4) ``` # PCA and Scree Plots on the Correlation Matrices Note that centering and scaling are set to `FALSE`. ```{r pcaRaw, eval=TRUE} pcaRaw <- prcomp(cmatRaw,scale=FALSE, center=FALSE) elRaw <- getElbows(pcaRaw$sdev, plot=FALSE) ``` ```{r pcaLog, eval=TRUE} pcaLog <- prcomp(cmatLog,scale=FALSE, center=FALSE) elLog <- getElbows(pcaLog$sdev, plot=FALSE) ``` ```{r pcaRank, eval=TRUE} pcaRank <- prcomp(cmatRank,scale=FALSE, center=FALSE) elRank <- getElbows(pcaRank$sdev, plot=FALSE) ``` ```{r cc-scree-plots, eval=TRUE, w=8,h=4, fig.cap=fig$cap("scree-plots","Scree plots")} par(mfrow=c(1,3)) plot(pcaRaw$sdev, type='b'); title("Scree plot: Raw Cor. Matrix") points(cbind(elRaw,pcaRaw$sdev[elRaw]), col='red', pch=20) plot(pcaLog$sdev, type='b'); title("Scree plot: Log Cor. Matrix") points(cbind(elLog,pcaLog$sdev[elLog]), col='red', pch=20) plot(pcaRank$sdev, type='b'); title("Scree plot: Rank Cor. Matrix") points(cbind(elRank,pcaRank$sdev[elRank]), col='red', pch=20) ``` # PAMK Running `pamk` for $K = 7$. ## PAMK on Raw data ```{r cc_pamkRaw, eval=TRUE, fig.cap=fig$cap("silRaw", "Average silhouette width plot on Raw Cor. Matrix")} K <- 7 pamRaw7 <- pamk(pcaRaw$x[,1:elRaw[2]],krange=K,usepam=FALSE,critout=FALSE) ``` ```{r cc_aggRaw, eval=TRUE} aggRaw <- aggregate(pcaRaw$x[,1:elRaw[2]],by=list(lab=pamRaw7$pamo$clustering),FUN=mean) aggRaw <- as.matrix(aggRaw[,-1]) rownames(aggRaw) <- clusterFraction(pamRaw7$pamo$clustering) ``` ```{r cc_heatmapRaw,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-raw", "Heat Map of Raw Data")} heatmap.2(as.matrix(aggRaw), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## PAMK on Log Data ```{r cc_pamkLog, eval=TRUE, fig.cap=fig$cap("silLog", "Average silhouette width plot on Log Cor. Matrix")} pamLog7 <- pamk(pcaLog$x[,1:elLog[2]],krange=K,usepam=FALSE,critout=FALSE) #plot(pamLog5$crit, type='b') ``` ```{r cc_aggLog, eval=TRUE} aggLog <- aggregate(pcaLog$x[,1:elLog[2]],by=list(lab=pamLog7$pamo$clustering),FUN=mean) aggLog <- as.matrix(aggLog[,-1]) rownames(aggLog) <- clusterFraction(pamLog7$pamo$clustering) ``` ```{r cc_heatmapLog,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-log", "Heat Map of Log Data")} heatmap.2(as.matrix(aggLog), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## PAMK on Ranked Data ```{r cc_pamkRank, eval=TRUE, fig.cap=fig$cap("silRank", "Average silhouette width plot on Rank Cor. Matrix")} pamRank7 <- pamk(pcaRank$x[,1:elRank[2]],krange=K,usepam=FALSE,critout=FALSE) #plot(pamRank5$crit, type='b') ``` ```{r cc_aggRank, eval=TRUE} aggRank <- aggregate(pcaRank$x[,1:elRank[2]],by=list(lab=pamRank7$pamo$clustering),FUN=mean) aggRank <- as.matrix(aggRank[,-1]) rownames(aggRank) <- clusterFraction(pamRank7$pamo$clustering) ``` ```{r cc_heatmapRank,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-rank", "Heat Map of Ranked Data")} heatmap.2(as.matrix(aggRank), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## ARI We are considering the channel types to be the "ground truth" here. ```{r cc_truth} truth <- as.numeric(factor(channel.type)) ``` Calculating the ARI for the raw data. ```{r ariRaw} xRaw <- as.numeric(pamRaw7$pamo$clustering) ariRaw <- foreach(i = 1:1e4, .combine='c') %dopar%{ y <- sample(xRaw) mclust::adjustedRandIndex(truth,y) } ``` ```{r ariLog} xLog <- as.numeric(pamLog7$pamo$clustering) ariLog <- foreach(i = 1:1e4, .combine='c') %dopar%{ y <- sample(xLog) mclust::adjustedRandIndex(truth,y) } ``` ```{r ariRank} xRank <- as.numeric(pamRank7$pamo$clustering) ariRank <- foreach(i = 1:1e4, .combine='c') %dopar%{ y <- sample(xRank) mclust::adjustedRandIndex(truth,y) } ``` ```{r cc_ariPlots, w=9,h=4, fig.cap=fig$cap("ariPlot","ARI plots")} par(mfrow=c(1,3)) hist(ariRaw) abline(v = adjustedRandIndex(truth,xRaw), lwd=2, col='red') hist(ariLog) abline(v = adjustedRandIndex(truth,xLog), lwd=2, col='red') hist(ariRank) abline(v = adjustedRandIndex(truth,xRank), lwd=2, col='red') ``` ```{r cc_ariNum} adjustedRandIndex(truth, xRaw) adjustedRandIndex(truth, xLog) adjustedRandIndex(truth, xRank) ``` ## ARI vs Embedding dimension. Here we investigeta how ARI varies with the embedding dimension. ### Functions ```{r cc_f1} avd <- function(pcaObj,elbow,truth){ ### ### Given a pca object with an elbow as upperbound ### compute the ARI of truth versus the pamk clustering for each ### dimension from 1:elbow ### require(foreach) out <- foreach(i = 1:elbow, .combine="rbind") %do% { pmk <- pamk(pcaObj$x[,1:i], krange=K, usepam=FALSE, critout=FALSE) data.frame(Embedding_Dimension=i,ARI=adjustedRandIndex(truth, pmk$pamo$clustering)) } return(out) } ``` ### ARI Plots ```{r arivdimRaw, eval=TRUE, w=9,h=4, fig.cap=fig$cap("ariVdim", "ARI vs DIM")} avdRaw <- avd(pcaRaw, elRaw[2], truth) avdLog <- avd(pcaLog, elLog[2], truth) avdRank <- avd(pcaRank, elRank[2], truth) avdRaw$group <- "Raw" avdLog$group <- "Log" avdRank$group <- "Rank" avdDat <- data.frame(rbind(avdRaw, avdLog, avdRank)) #par(mfrow=c(1,3)) #plot(avdRaw, type='b', main="Raw Data") #plot(avdLog, type='b', main="Log Data") #plot(avdRank, type='b', main="Rank Data") p1 <- ggplot(data=avdDat, aes(x=Embedding_Dimension, y=ARI,group=group,color=group)) + geom_line(size=1.5) + geom_point(size=3) + theme(axis.text=element_text(size=18)) + ggtitle("ARI vs. DIM") print(p1) ``` # Plots of embeddings First let's look at the 3-d embeddings and make some decisions from there. ```{r cc_3dEmb, h=6,w=6} pairs(pcaRaw$x[,1:3],col=exCol,pch=as.numeric(exType)+15,cex=2,main="Raw") pairs(pcaLog$x[,1:3],col=exCol, pch=as.numeric(exType)+15,cex=2,main="Log") pairs(pcaRank$x[,1:3],col=exCol,pch=as.numeric(exType)+15,cex=2,main="Rank") ``` ```{r cc_3dRaw} pca <- pcaRaw$x[,1:3] rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Raw") rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5)) subid <- currentSubscene3d() rglwidget(elementId="plot3dRaw") ``` ```{r cc_3dLog} pca <- pcaLog$x rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Log") rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5)) subid <- currentSubscene3d() rglwidget(elementId="plot3dLog") ``` ```{r cc_3dRank} pca <- pcaRank$x[,1:3] rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=3, main="Rank") rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5)) subid <- currentSubscene3d() rglwidget(elementId="plot3dRank") ``` # Pick $K=3$. ## PAMK=3 on Raw data ```{r cc_pamk3Raw, eval=TRUE, fig.cap=fig$cap("silRaw", "Average silhouette width plot on Raw Cor. Matrix")} K <- 3 pamRaw3 <- pamk(pcaRaw$x[,1:elRaw[2]],krange=K,usepam=FALSE,critout=FALSE) ``` ```{r cc_agg3Raw, eval=TRUE} agg3Raw <- aggregate(pcaRaw$x[,1:elRaw[2]],by=list(lab=pamRaw3$pamo$clustering),FUN=mean) agg3Raw <- as.matrix(agg3Raw[,-1]) rownames(agg3Raw) <- clusterFraction(pamRaw3$pamo$clustering) ``` ```{r cc_heatmap3Raw,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-3raw", "Heat Map of Raw Data")} heatmap.2(as.matrix(agg3Raw), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## PAMK=3 on Log Data ```{r cc_pamk3Log, eval=TRUE, fig.cap=fig$cap("silLog", "Average silhouette width plot on Log Cor. Matrix")} pamLog3 <- pamk(pcaLog$x[,1:elLog[2]],krange=K,usepam=FALSE,critout=FALSE) #plot(pamLog5$crit, type='b') ``` ```{r cc_agg3Log, eval=TRUE} agg3Log <- aggregate(pcaLog$x[,1:elLog[2]],by=list(lab=pamLog3$pamo$clustering),FUN=mean) agg3Log <- as.matrix(agg3Log[,-1]) rownames(agg3Log) <- clusterFraction(pamLog3$pamo$clustering) ``` ```{r cc_heatmap3Log,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-3log", "Heat Map of Log Data")} heatmap.2(as.matrix(agg3Log), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## PAMK=3 on Ranked Data ```{r cc_pamk3Rank, eval=TRUE, fig.cap=fig$cap("silRank", "Average silhouette width plot on Rank Cor. Matrix")} pamRank3 <- pamk(pcaRank$x[,1:elRank[2]],krange=K,usepam=FALSE,critout=FALSE) #plot(pamRank5$crit, type='b') ``` ```{r cc_agg3Rank, eval=TRUE} agg3Rank <- aggregate(pcaRank$x[,1:elRank[2]],by=list(lab=pamRank3$pamo$clustering),FUN=mean) agg3Rank <- as.matrix(agg3Rank[,-1]) rownames(agg3Rank) <- clusterFraction(pamRank3$pamo$clustering) ``` ```{r cc_heatmap3Rank,eval=TRUE,w=6,h=6,echo=TRUE,fig.cap=fig$cap("heat-3rank", "Heat Map of Ranked Data")} heatmap.2(as.matrix(agg3Rank), trace="none",col=mycol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90) # ``` ## ARI Plots for $K=3$ ```{r arivdim3Raw, eval=TRUE, w=9,h=4, fig.cap=fig$cap("ariVdim", "ARI vs DIM")} avd3Raw <- avd(pcaRaw,elRaw[2],truth) avd3Log <- avd(pcaLog,elLog[2],truth) avd3Rank <- avd(pcaRank,elRank[2],truth) avd3Raw$group <- "Raw" avd3Log$group <- "Log" avd3Rank$group <- "Rank" avd3Dat <- data.frame(rbind(avd3Raw, avd3Log, avd3Rank)) p2 <- ggplot(data=avd3Dat, aes(x=Embedding_Dimension, y=ARI,group=group,color=group)) + geom_line(size=1.5) + geom_point(size=3) + theme(axis.text=element_text(size=18)) + ggtitle("ARI vs. DIM") print(p2) ``` ```{r test, eval=FALSE} pca <- pcaRaw$x[,1:3] rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s', col=exCol, size=1, main="Raw") rgl::rgl.texts(pca[,1],pca[,2],pca[,3], channel[ford], col=exCol, adj=c(0,1.5)) subid <- currentSubscene3d() rglwidget(elementId="plot3dRaw") open3d() plot3d( cube3d(col = "green") ) M <- par3d("userMatrix") if (!rgl.useNULL()) play3d( par3dinterp(time = (0:2)*4, userMatrix = list(M, rotate3d(M, pi/2, 1, 0, 0), rotate3d(M, pi/2, 0, 0, 1) ) ), duration = 18 ) movie3d( spin3d(), duration = 5) subid <- currentSubscene3d() rglwidget(elementId="plot3dRank") ```