---
title: "Synaptome Stats: Marker Exploration"
author: "Jesse Leigh Patsolic"
date: '`r Sys.Date()`'
output:
  html_document:
    fig_caption: yes
    fig_height: 5
    fig_width: 5
    fig_retina: 2.7
    highlight: pygments
    keep_md: yes
    number_sections: yes
    theme: cerulean
    toc: yes
    toc_depth: 3
  pdf_document:
    fig_caption: yes
    keep_tex: yes
    number_sections: yes
---

```{r render, eval=FALSE, echo=FALSE}
require(rmarkdown)
rm(list=ls())
system.time(rmarkdown::render(grep("MarkerExploration.Rmd", dir(), value=TRUE)))
system('open MarkerExploration.html')
system('say -r 200 Your R-script has completed')
```

```{r setup,include=FALSE,results='asis'}
### Library calls here.
library(rmarkdown)
library(knitr)
require(ggplot2)
require(gridExtra)
require(gplots)
require(lattice)
require(hexbin)
require(data.table)
library(deldir) # for voronoi
require(dplyr)
require(energy)
require(lattice)
require(colorRamps)
require(corrplot)
require(rgl)
require(rglwidget)
require(randomForest)
require(doMC); registerDoMC(6L)
require(foreach)
require(MASS)
require(abind)
source("kmpp.r")
source("clusterFraction.r")
source("http://www.cis.jhu.edu/~parky/Synapse/getElbows.R")
```

```{r knitrOpts, include=FALSE, results='asis', message=FALSE, warning=FALSE}
### The following options and figure numbering functions
### were setup by Youngser Park
knitr::opts_chunk$set(cache=TRUE, autodep=TRUE)
dep_auto() # figure out dependencies automatically

opts_chunk$set(cache=FALSE, echo=TRUE,
               fig.path='../Figures/MarkerExploration_figure/',
               warning=FALSE, message=FALSE, comment="#",
               dpi=227, dev=c('png','pdf'))

opts_knit$set(aliases = c(h='fig.height', w='fig.width', cap='fig.cap', scap='fig.scap'))
opts_knit$set(eval.after = c('fig.cap','fig.scap'))

knit_hooks$set(document = function(x) {
  gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x)
})
#opts_knit$set(animation.fun = hook_scianimator)

knit_hooks$set(plot = function(x, options) {
  paste('<figure><img src="', opts_knit$get('base.url'), paste(x, collapse='.'),
        '"><figcaption>', options$fig.cap, '</figcaption></figure>', sep='')
})

fn = local({
  i = 0
  function(x) {
    i <<- i + 1
    paste('Figure ', i, ': ', x, sep='')
  }
})

fig <- local({
  i <- 0
  ref <- list()
  list(
    cap=function(refName, text) {
      i <<- i + 1
      ref[[refName]] <<- i
      paste("Figure ", i, ": ", text, sep="")
    },
    ref=function(refName) {
      ref[[refName]]
    })
})
```

[Homepage](http://docs.neurodata.io/synaptome-stats/)
The formatted source code for this file is [here](https://github.com/neurodata/synaptome-stats/blob/gh-pages/Code/MarkerExploration.Rmd).
And a raw version [here](https://raw.githubusercontent.com/neurodata/synaptome-stats/gh-pages/Code/MarkerExploration.Rmd).
Previous work by Youngser Park can be found [here](http://www.cis.jhu.edu/~parky/Synapse/synapse.html).

# Data

Here we read in the data and select a random half of it for exploration.

```{r cc-data, eval=TRUE}
featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt", showProgress=FALSE)

### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^10)
half1 <- sample(dim(featFull)[1], dim(featFull)[1]/2)
half2 <- setdiff(1:dim(featFull)[1], half1)

feat <- featFull[half1,]
dim(feat)

## Setting the channel names
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
             'psd','glur2','nmdar1','nr2b','gad','VGAT',
             'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
             'NOS','TH','VACht','Synapo','tubuli','DAPI')

## Setting the channel types
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
                  'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                  'in.pre','in.post','in.post','in.post','in.pre.small','other',
                  'ex.post','other','other','ex.post','none','none')
channel.type2 <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','other',
                   'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
                   'in.pre','in.post','in.post','in.post','other','other',
                   'ex.post','other','other','ex.post','other','other')

nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel

## Creating factor variables for channel and channel type, sorted properly
ffchannel <- (factor(channel.type,
    levels = c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")))
fchannel <- as.numeric(factor(channel.type,
    levels = c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")))
ford <- order(fchannel)

## Setting up colors for channel types
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","#0000cd","#ffd700")
Syncol3 <- c("#197300","#197300","#cc0000","#cc0000","#0000cd","#0000cd","#0000cd")
ccol <- Syncol[fchannel]
ccol3 <- Syncol3[fchannel]

exType <- factor(c(rep("ex",11), rep("in",6), rep("other",7)), ordered=TRUE)
exCol <- exType; levels(exCol) <- c("#197300","#990000","#0000cd"); exCol <- as.character(exCol)

fname <- as.vector(sapply(channel, function(x) paste0(x, paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)

mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
mycol3 <- colorpanel(100, "blue", "white", "red")
```

## Data transformations

```{r cc-datatrans, eval=TRUE}
f <- lapply(1:6, function(x){ seq(x, ncol(feat), by=nfeat) })
featF <- lapply(f, function(x){ subset(feat, select=x) })
featF0 <- featF[[1]]

f01e3 <- lapply(featF, function(x){
  1e3 * data.table(apply(X=x, 2, function(y){ (y - min(y)) / (max(y) - min(y)) }))
})
fs <- f01e3[[1]]

### Taking log_10 on data + 1.
log1f <- log10(featF0 + 1)
slog1f <- data.table(scale(log1f, center=TRUE, scale=TRUE))
```

We now have the following data sets:

- `featF0`: The feature vector restricted to the integrated brightness (F0) features.
- `fs`: The feature vector scaled to $[0,1000]$.
- `log1f`: The feature vector, plus one, then $\log_{10}$ is applied.
- `slog1f`: The feature vector, plus one, then $\log_{10}$, then scaled by subtracting the mean and dividing by the sample standard deviation.

# Marker Exploration

## Correlation Matrix of markers

```{r cc_corRawF0,w=7,h=14, eval=TRUE,fig.cap=fig$cap("corr","Correlation matrices of the untransformed F0-F5 data, reordered by synapse type.")}
#corrf <- lapply(lapply(f01e3, cor), function(x) x[ford, ford])
corrf <- lapply(lapply(featF, cor), function(x) x[ford, ford])

titles <- paste('Correlation Matrix of', paste0("F", 0:5))

par(mfrow = c(3,2))
for(i in 1:length(corrf)) {
  corrplot(corrf[[i]], method="color", tl.col=ccol[ford], tl.cex=0.8, mar=c(1,0,1,1.5))
  title(titles[i])
}
```

```{r cc-bigCorr, w=8,h=8, eval=TRUE,fig.cap=fig$cap("corrB","Full correlation on untransformed F0-F5 data, reordered by synapse type.")}
bford <- order(rep(fchannel, each=6))
nord <- Reduce('c', f)
cr <- rep(ccol, each=6)

corrfB <- cor(feat)[bford, bford]
corrplot(corrfB, method="color", tl.col=cr[bford], tl.cex=0.75)
```

## Distance Correlation T-tests

```{r cc-dcor,eval=TRUE,echo=TRUE}
computeDcor <- function(x) {
  set.seed(317)
  sam1 <- sample(dim(x)[1], min(1e3, dim(x)[1]))
  tmp <- as.data.frame((x[sam1,]))

  combcols <- t(combn(24, 2))

  dc <- foreach(i = 1:dim(combcols)[1]) %dopar% {
    set.seed(331*i)
    dcor.ttest(x=tmp[,combcols[i,1]], y=tmp[,combcols[i,2]])
  }

  ms <- matrix(as.numeric(0), 24, 24)
  mp <- matrix(as.numeric(0), 24, 24)

  for(i in 1:length(dc)){
    ms[combcols[i,1],combcols[i,2]] <- dc[[i]]$statistic
    ms[combcols[i,2],combcols[i,1]] <- dc[[i]]$statistic
    mp[combcols[i,1],combcols[i,2]] <- dc[[i]]$p.val
    mp[combcols[i,2],combcols[i,1]] <- dc[[i]]$p.val
  }

  rownames(ms) <- colnames(x)
  rownames(mp) <- colnames(x)
  colnames(ms) <- colnames(x)
  colnames(mp) <- colnames(x)

  diag(ms) <- as.numeric(0)
  diag(mp) <- as.numeric(1)

  return(list(ms, mp))
}

mdcor <- lapply(featF, computeDcor)
```

```{r cc-energyplot,h=17,w=7,fig.cap=fig$cap("enplt","F0-5: Distance correlation t-test statistic and p-value matrices.")}
cl5 <- colorRampPalette(c("white", "blue"))
gr5 <- colorRampPalette(c("darkgreen", "white", "white"))
bl5 <- colorRampPalette(c("blue", "red"))

sTitle <- paste("dcor.ttest statistic", paste0("F", 0:5))
pTitle <- paste("dcor.ttest log10(p-value)", paste0("F", 0:5))

par(mfcol=c(6,2), oma=2*c(1,1,1,1))
for(i in 1:length(mdcor)){
  corrplot(mdcor[[i]][[1]][ford,ford], is.corr=FALSE, method="color",
           tl.col=ccol[ford], tl.cex=0.8, mar=3*c(1,0,1,1.5))
  corrRect(as.numeric(table(fchannel)), col=Syncol, lwd=4)
  title(sTitle[i])
}

for(i in 1:length(mdcor)){
  corrplot(log10(mdcor[[i]][[2]][ford,ford] + .Machine$double.eps),
           is.corr=FALSE, method="color", tl.col=ccol[ford], tl.cex=0.8,
           mar=3*c(1,0,1,1.5), p.mat=mdcor[[i]][[2]][ford,ford],
           sig.level=0.01, pch='x', pch.cex=0.5)
  corrRect(as.numeric(table(fchannel)), col=Syncol, lwd=4)
  title(pTitle[i])
}
```

The `x`'s in the p-value plots (right-hand column) above mark marker pairs with a p-value greater than 0.01.
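
For reference, the test applied to each pair of markers above is the distance correlation t-test (`dcor.ttest`) from the `energy` package. The chunk below is a minimal single-pair sketch using the same subsampling scheme as `computeDcor()`; the two markers shown are chosen arbitrarily for illustration and the chunk is not evaluated here.

```{r cc-dcor-pair, eval=FALSE}
## Sketch: dcor.ttest on one pair of markers, subsampled as in computeDcor().
set.seed(317)
sam <- sample(nrow(featF0), min(1e3, nrow(featF0)))
tmp1 <- as.data.frame(featF0)[sam, ]

dt <- dcor.ttest(x = tmp1$Synap_1F0, y = tmp1$psdF0)
dt$statistic  # the value stored in the `ms` matrix for this pair
dt$p.value    # the value stored in the `mp` matrix for this pair
```
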
## PCA Scatterplots

We run PCA on each of the untransformed correlation matrices so that the markers can be viewed in two dimensions. The colors correspond to synapse type.

```{r cc-pca1,h=7,w=7,fig.cap=fig$cap("pairs1","PCA of untransformed correlation matrix")}
pcaL <- lapply(corrf, prcomp, center=TRUE, scale=TRUE)

titlepca <- paste("PCA on ", paste0("cor(F", 0:5, ')'))

for(i in 1:length(pcaL)) {
  pairs(pcaL[[i]]$x[,1:3], col=ccol3[ford], pch=20, cex=2, main=titlepca[i])
}
```

### PCA on cor(F0)

```{r cc_3dCorr,echo=TRUE, eval=TRUE, fig.cap=fig$cap("rgl1","PCA 1-3 on untransformed correlation matrix")}
pca <- pcaL[[1]]$x

rgl::plot3d(pca[,1], pca[,2], pca[,3], type='s', col=ccol3[ford], size=1,
            xlab = "PC1", ylab = "PC2", zlab = "PC3")
rgl::rgl.texts(pca[,1], pca[,2], pca[,3], abbreviate(rownames(pca)),
               col=ccol3[ford], adj=c(0,2))
title3d(main = "PCA on cor(F0)")
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pca1", width=720, height=720)
```

```{r cc-pcaB1,h=7,w=7,fig.cap=fig$cap("pairs2","PCA of untransformed correlation matrix")}
pcaB <- prcomp(corrfB, center=TRUE, scale=TRUE)
pairs(pcaB$x[,1:3], col=cr[bford], pch=20, cex=2)
```

```{r cc-pcaB2,h=7,w=7,fig.cap=fig$cap("pairs3","PC2 v PC1 of untransformed correlation matrix")}
plot(pcaB$x[,1:3], col=cr[bford], pch=20, cex=2)
text(pcaB$x[,1:2], labels=rownames(pcaB$x), pos=4, col=cr[bford])
```

```{r cc_3dCorrB, fig.cap=fig$cap("rglB","PCA 1-3 on untransformed correlation matrix")}
pcaB <- pcaB$x

rgl::plot3d(pcaB[,1], pcaB[,2], pcaB[,3], type='s', col=cr[bford], size=1)
rgl::rgl.texts(pcaB[,1], pcaB[,2], pcaB[,3], abbreviate(rownames(pcaB)),
               col=cr[bford], adj=c(0,2))
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pcaB", width=720, height=720)
```

# LDA

## Scree Plots

First we take a look at the scree plots for each of the feature primitives F0-F5, along with their corresponding elbows as given by Zhu and Ghodsi (2006).

```{r cc-scree1, echo=FALSE, h=6, w=7, fig.cap=fig$cap("scree1","Scree plots for F0-F5")}
par(mfrow = c(2,3))
el2 <- lapply(pcaL, function(x) getElbows(x$sdev, plot = FALSE))
tmp <- mapply(function(x, y) {
  getElbows(x$sdev, plot = TRUE)
  title(y)
}, pcaL, paste0("Scree plot F", 0:5))
```

## Cumulative Variance plots

```{r cc-cumsum1, h=6, w=7, fig.cap=fig$cap("cumsum1","% Variance Explained for F0-F5")}
par(mfrow = c(2,3))
for(i in 1:length(pcaL)){
  zs <- rep(1, dim(pcaL[[i]]$x)[2])
  zb <- rep(0, dim(pcaL[[i]]$x)[2])
  zs[el2[[i]]] <- 2
  zb[el2[[i]]] <- 2

  plot(100 * cumsum(pcaL[[i]]$sdev) / sum(pcaL[[i]]$sdev), axes = TRUE,
       type='b', pch = 21, bg = zb, col = zs, xlab = 'd', ylab = '% Var',
       main = paste0('% Var Explained for F', i - 1))
  axis(side = 1, at = el2[[i]], padj = 1.25)
}
```

```{r cc-lda1}
dat <- lapply(pcaL, function(x) data.frame(x$x))
type <- truth <- factor(exType, ordered = FALSE)
tr <- as.numeric(truth)

lda.fit <- list()
lda.fit[[1]] <- lda(type ~ ., data = dat[[1]][, 1:el2[[1]][2]])
lda.fit[[2]] <- lda(type ~ ., data = dat[[2]][, 1:el2[[2]][2]])
lda.fit[[3]] <- lda(type ~ ., data = dat[[3]][, 1:el2[[3]][2]])
lda.fit[[4]] <- lda(type ~ ., data = dat[[4]][, 1:el2[[4]][2]])
lda.fit[[5]] <- lda(type ~ ., data = dat[[5]][, 1:el2[[5]][2]])
lda.fit[[6]] <- lda(type ~ ., data = dat[[6]][, 1:el2[[6]][1]])

lda.pred <- lapply(lda.fit, predict)
```
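
A quick way to inspect any one of these fits is a confusion table of the re-substitution LDA predictions against the true synapse types. The sketch below (not evaluated here) does this for the F0 fit, using the `lda.pred` and `truth` objects defined above.

```{r cc-lda1-confusion, eval=FALSE}
## Sketch: re-substitution confusion table for the F0 LDA fit.
## Rows are the true synapse types, columns the LDA-predicted types.
table(truth = truth, predicted = lda.pred[[1]]$class)

## Corresponding re-substitution error rate for F0.
mean(lda.pred[[1]]$class != truth)
```
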
```{r cc-lda-cep}
Lm <- foreach(i = 1:6, .combine=rbind) %:%
  foreach(j = 1:24, .combine=rbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapred <- as.numeric(predict(lda(type ~ ., dat[[i]][1:j]))$class)
      er <- 1/24 * sum(ldapred != as.numeric(truth))
      out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
    }, silent=TRUE)
    out
  }

rownames(Lm) <- NULL
Lm <- data.table(Lm)

Lcvm <- foreach(i = 1:6, .combine=rbind) %:%
  foreach(j = 1:24, .combine=rbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapred <- as.numeric((lda(type ~ ., dat[[i]][1:j], CV = TRUE))$class)
      er <- 1/24 * sum(ldapred != as.numeric(truth))
      out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
    }, silent=TRUE)
    out
  }

rownames(Lcvm) <- NULL
Lcvm <- data.table(Lcvm)

Lm$CV <- "Without CV"
Lcvm$CV <- "With CV"

tmp <- data.table(rbind(Lm, Lcvm))
tmp$CV <- factor(tmp$CV)
tmp$feat <- factor(tmp$feat)

size <- foreach(i = 1:6, .combine=c) %do% {
  size <- rep(1, 23)
  size[el2[[i]]] <- 2
  size
}

tmp$size <- factor(c(size, size))
levels(tmp$size) <- c("non-elbow", "elbow")
```

## Misclassification rates for LDA w/ and w/o CV

```{r cc-cvPlot, h=6, w = 8, fig.cap=fig$cap("cvPlot","Lhat vs $d$ for FLD on F0-F5 with an h-line at chance")}
xvalVr <-
  ggplot(tmp, aes(x = d, y = Lhat, color = feat)) +
  facet_grid(. ~ CV) +
  geom_line(alpha = 0.5) +
  geom_point(alpha = 0.65, aes(size = size)) +
  geom_hline(yintercept = 0.64236) +
  ggtitle("FLD (LDA): xval vs. re-substitution")
print(xvalVr)
```

## Misclassifications vs $\hat d$

```{r cc-lda-misclass, h=10, w=8, fig.cap = fig$cap("misclassLDAcv", "Misclassifications using LDA with CV for various $d$.")}
L <- foreach(i = 1:6) %:%
  foreach(j = 1:24, .combine = cbind) %do% {
    out <- NULL
    try({
      set.seed(12)
      ldapredcv <- as.numeric(lda(type ~ ., dat[[i]][1:j], CV = TRUE)$class)
      out <- ldapredcv
    }, silent=TRUE)
    out
  }

A <- lapply(L, function(x) { apply(x, 2, function(y) y != tr) })
B <- lapply(A, function(x) { x[, 1:10] })
B <- lapply(B, function(x) {
  rownames(x) <- channel[ford]
  colnames(x) <- paste0('d', 1:dim(x)[2])
  return(x)
})

par(mfrow = c(2,3))
corT <- paste0("Misclassifications ", "F", 0:5)
for(i in 1:6) {
  corrplot(B[[i]], method='color', addgrid.col=1, tl.col = ccol[ford], mar=3*c(1,0,1,1.5))
  title(corT[i])
}
```

The above plots show where LDA with cross-validation misclassifies each data point: a misclassification is denoted by a filled block, and the columns denote the embedding dimension $\hat d$. A per-channel numerical summary appears in the sketch after the decision-boundary figure below.

```{r cc-voronoi,h=12,w=8,fig.cap=fig$cap("voronoi","Voronoi diagrams on class means from LDA on PCA of untransformed correlation matrices")}
titlesvor <- paste("LDA decision boundaries for", paste0("F", 0:5))

voronoidf <- lapply(lapply(lda.fit, '[[', 3), data.frame)
#voronoidf <- data.frame(x=lda.fit$means[,1],y=lda.fit$means[,2])

#This creates the voronoi line segments
par(mfrow = c(3,2))
for(i in 1:length(dat)){
  plot(dat[[i]][,1:2], col=ccol3[ford], pch=20, cex=1.5)
  title(titlesvor[i])
  text(dat[[i]][,1:2], labels=rownames(dat[[i]]),
       pos=ifelse(dat[[i]][,1] < 0, 4, 2), col=ccol3[ford])
  ## NOTE: assumed completion -- draw the Voronoi segments of the LDA class
  ## means (first two PCs) with deldir, clipped to the plotting window.
  vt <- deldir(voronoidf[[i]][,1], voronoidf[[i]][,2],
               rw = c(range(dat[[i]][,1]), range(dat[[i]][,2])))
  segments(vt$dirsgs$x1, vt$dirsgs$y1, vt$dirsgs$x2, vt$dirsgs$y2, lty=2)
}
```
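
As a numerical companion to the misclassification figure above, the `B` matrices can be collapsed into per-channel error counts over the embedding dimensions $d = 1, \dots, 10$. The following is a minimal sketch (not evaluated here) using the objects defined in the `cc-lda-misclass` chunk.

```{r cc-misclass-summary, eval=FALSE}
## Sketch: for each channel, count in how many of the d = 1..10 embeddings the
## cross-validated LDA prediction was wrong (one column per feature F0-F5).
miscounts <- sapply(B, rowSums)
colnames(miscounts) <- paste0("F", 0:5)
miscounts[order(rowSums(miscounts), decreasing = TRUE), ]
```
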

[Back to Top](#data)