---
title: "Synaptome Stats: Marker Exploration"
author: "Jesse Leigh Patsolic"
date: '`r Sys.Date()`'
output:
html_document:
fig_caption: yes
fig_height: 5
fig_width: 5
fig_retina: 2.7
highlight: pygments
keep_md: yes
number_sections: yes
theme: cerulean
toc: yes
toc_depth: 3
pdf_document:
fig_caption: yes
keep_tex: yes
number_sections: yes
---
```{r render, eval=FALSE, echo=FALSE}
require(rmarkdown)
rm(list=ls());
system.time(rmarkdown::render(grep("MarkerExploration.Rmd", dir(), value=TRUE)))
system('open MarkerExploration.html')
system('say -r 200 Your R-script has completed')
```
```{r setup,include=FALSE,results='asis'}
### Library calls here.
library(rmarkdown)
library(knitr)
require(ggplot2)
require(gridExtra)
require(gplots)
require(lattice)
require(hexbin)
require(data.table)
library(deldir) # for voronoi
require(dplyr)
require(energy)
require(lattice)
require(colorRamps)
require(corrplot)
require(rgl)
require(rglwidget)
require(randomForest)
require(doMC); registerDoMC(6L)
require(foreach)
require(MASS)
require(abind)
source("kmpp.r")
source("clusterFraction.r")
source("http://www.cis.jhu.edu/~parky/Synapse/getElbows.R")
```
```{r knitrOpts, include=FALSE, results='asis', message=FALSE, warning=FALSE}
### The following options and figure numbering functions
### were setup by Youngser Park
knitr::opts_chunk$set(cache=TRUE, autodep=TRUE)
dep_auto() # figure out dependencies automatically
opts_chunk$set(cache=FALSE, echo=TRUE,
opts_chunk$set(fig.path = '../Figures/MarkerExploration_figure/'),
warning=FALSE, message=FALSE,
comment="#",
dpi=227,dev=c('png','pdf'))
opts_knit$set(aliases = c(h = 'fig.height', w = 'fig.width',
cap='fig.cap', scap='figscap'))
opts_knit$set(eval.after = c('fig.cap','fig.scap'))
knit_hooks$set(document = function(x) {
gsub('(\\\\end\\{knitrout\\}[\n]+)', '\\1\\\\noindent ', x)
}
)
#opts_knit$set(animation.fun = hook_scianimator)
knit_hooks$set(plot = function(x, options) {
paste('',
sep = '')
})
fn = local({
i = 0
function(x) {
i <<- i + 1
paste('Figure ', i, ': ', x, sep = '')
}
})
fig <- local({
i <- 0
ref <- list()
list(
cap=function(refName, text) {
i <<- i + 1
ref[[refName]] <<- i
paste("Figure ", i, ": ", text, "
", sep="")
},
ref=function(refName) {
ref[[refName]]
})
})
```
[Homepage](http://docs.neurodata.io/synaptome-stats/)
The formatted source code for this file is [here](https://github.com/neurodata/synaptome-stats/blob/gh-pages/Code/MarkerExploration.Rmd).
And a raw version [here](https://raw.githubusercontent.com/neurodata/synaptome-stats/gh-pages/Code/MarkerExploration.Rmd).
Previous work by Youngser Park can be found [here](http://www.cis.jhu.edu/~parky/Synapse/synapse.html).
# Data
Here we read in the data and select a random half of it for exploration.
```{r cc-data, eval=TRUE}
featFull <- fread("../data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^10)
half1 <- sample(dim(featFull)[1],dim(featFull)[1]/2)
half2 <- setdiff(1:dim(featFull)[1],half1)
feat <- featFull[half1,]
dim(feat)
## Setting the channel names
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
'psd','glur2','nmdar1','nr2b','gad','VGAT',
'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
'NOS','TH','VACht','Synapo','tubuli','DAPI')
## Setting the channel types
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','in.pre.small','other',
'ex.post','other','other','ex.post','none','none')
channel.type2 <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','other',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','other','other',
'ex.post','other','other','ex.post','other','other')
nchannel <- length(channel)
nfeat <- ncol(feat) / nchannel
## Createing factor variables for channel and channel type sorted properly
ffchannel <- (factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
fchannel <- as.numeric(factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
ford <- order(fchannel)
## Setting up colors for channel types
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","#0000cd","#ffd700")
Syncol3 <- c("#197300","#197300","#cc0000","#cc0000","#0000cd","#0000cd","#0000cd")
ccol <- Syncol[fchannel]
ccol3 <- Syncol3[fchannel]
exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","#0000cd");
exCol <- as.character(exCol)
fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
mycol3 <- colorpanel(100, "blue","white","red")
```
## Data transformations
```{r cc-datatrans, eval=TRUE}
f <- lapply(1:6,function(x){seq(x,ncol(feat),by=nfeat)})
featF <- lapply(f,function(x){subset(feat,select=x)})
featF0 <- featF[[1]]
f01e3 <- lapply(featF,function(x){
1e3*data.table(apply(X=x,2,function(y){((y-min(y))/(max(y)-min(y)))}))
})
fs <- f01e3[[1]]
### Taking log_10 on data + 1.
log1f <- log10(featF0 + 1)
slog1f <- data.table(scale(log1f, center=TRUE,scale=TRUE))
```
We now have the following data sets:
- `featF0`: The feature vector looking only at the integrated brightness features.
- `fs`: The feature vector scaled between $[0,1000]$.
- `logf1`: The feature vector, plus one, then $log_{10}$ is applied.
- `slog1f`: The feature vector, plus one, $log_{10}$, then scaled by
subtracting the mean and dividing by the sample standard deviation.
# Marker Exploration
## Correlation Matrix of markers
```{r cc_corRawF0,w=7,h=14, eval=TRUE,fig.cap=fig$cap("corr","Correlation on untransformed F0 data, reordered by synapse type.")}
#corrf <- lapply(lapply(f01e3, cor), function(x) x[ford, ford])
corrf <- lapply(lapply(featF, cor), function(x) x[ford, ford])
titles <- paste('Correlation Matrix of', paste0("F", 0:5))
par(mfrow = c(3,2))
for(i in 1:length(corrf)) {
corrplot(corrf[[i]],method="color",tl.col=ccol[ford],
tl.cex=0.8, mar=c(1,0,1,1.5))
title(titles[i])
}
```
```{r cc-bigCorr, w=8,h=8, eval=TRUE,fig.cap=fig$cap("corrB","Full Correlation on untransformed F0-F5 data, reordered by synapse type.")}
bford <- order(rep(fchannel,each=6))
nord <- Reduce('c', f)
cr <- rep(ccol, each=6)
corrfB <- cor(feat)[bford,bford]
corrplot(corrfB,method="color",tl.col=cr[bford],tl.cex=0.75)
```
## Distance Correlation T-tests
```{r cc-dcor,eval=TRUE,echo=TRUE}
computeDcor <- function(x) {
set.seed(317)
sam1 <- sample(dim(x)[1], min(1e3,dim(x)[1]))
tmp <- as.data.frame((x[sam1,]))
combcols <- t(combn(24,2))
dc <- foreach(i = 1:dim(combcols)[1]) %dopar% {
set.seed(331*i)
dcor.ttest(x=tmp[,combcols[i,1]],y=tmp[,combcols[i,2]])
}
ms <- matrix(as.numeric(0),24,24)
mp <- matrix(as.numeric(0),24,24)
for(i in 1:length(dc)){
ms[combcols[i,1],combcols[i,2]] <- dc[[i]]$statistic
ms[combcols[i,2],combcols[i,1]] <- dc[[i]]$statistic
mp[combcols[i,1],combcols[i,2]] <- dc[[i]]$p.val
mp[combcols[i,2],combcols[i,1]] <- dc[[i]]$p.val
}
rownames(ms) <- colnames(x)
rownames(mp) <- colnames(x)
colnames(ms) <- colnames(x)
colnames(mp) <- colnames(x)
diag(ms) <- as.numeric(0)
diag(mp) <- as.numeric(1)
return(list(ms, mp))
}
mdcor <- lapply(featF, computeDcor)
```
```{r cc-energyplot,h=17,w=7,fig.cap=fig$cap("enplt","F0-5: Distance correlation t-test statistic and p-value matrices.")}
cl5 <- colorRampPalette(c("white", "blue"))
gr5 <- colorRampPalette(c("darkgreen", "white", "white"))
bl5 <- colorRampPalette(c("blue", "red"))
sTitle <- paste("dcor.ttest statistic", paste0("F", 0:5))
pTitle <- paste("dcor.ttest log10(p-value)", paste0("F", 0:5))
par(mfcol=c(6,2), oma=2*c(1,1,1,1))
for(i in 1:length(mdcor)){
corrplot(mdcor[[i]][[1]][ford,ford],is.corr=FALSE,method="color",
tl.col=ccol[ford], tl.cex=0.8, mar=3*c(1,0,1,1.5))
corrRect(as.numeric(table(fchannel)),col=Syncol,lwd=4)
title(sTitle[i])
}
for(i in 1:length(mdcor)){
corrplot((log10(mdcor[[i]][[2]][ford,ford]+.Machine$double.eps)),is.corr=FALSE,method="color",tl.col=ccol[ford],
tl.cex=0.8, mar=3*c(1,0,1,1.5),
p.mat=mdcor[[i]][[2]][ford,ford],
sig.level=0.01, pch = 'x', pch.cex=0.5)
corrRect(as.numeric(table(fchannel)),col=Syncol,lwd=4)
title(pTitle[i])
}
```
The `X`'s in the above right figure denote a p-value greater than 0.01.
## PCA Scatterplots
We will run PCA on the untransformed correlation matrix so the data can be
viewed in 2-dimensions. The colors correspond to synapse type.
```{r cc-pca1,h=7,w=7,fig.cap=fig$cap("pairs1","PCA of untransformed correlation matrix")}
pcaL <- lapply(corrf, prcomp, center=TRUE, scale=TRUE)
titlepca <- paste("PCA on ", paste0("cor(F", 0:5, ')'))
for(i in 1:length(pcaL)) {
pairs(pcaL[[i]]$x[,1:3], col=ccol3[ford],pch=20, cex=2,
main=titlepca[i])
}
```
### PCA on cor(F0)
```{r cc_3dCorr,echo=TRUE, eval=TRUE, fig.cap=fig$cap("rgl1","PCA 1-3 on untransformed correlation matrix")}
pca <- pcaL[[1]]$x
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s',col=ccol3[ford], size=1,
xlab = "PC1", ylab = "PC2", zlab = "PC3")
rgl::rgl.texts(pca[,1],pca[,2],pca[,3],abbreviate(rownames(pca)), col=ccol3[ford], adj=c(0,2))
title3d(main = "PCA on cor(F0)")
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pca1",width=720,height=720)
```
```{r cc-pcaB1,h=7,w=7,fig.cap=fig$cap("pairs2","PCA of untransformed correlation matrix")}
pcaB <- prcomp(corrfB,center=TRUE, scale=TRUE)
pairs(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
```
```{r cc-pcaB2,h=7,w=7,fig.cap=fig$cap("pairs1","PC2 v PC1 of untransformed correlation matrix")}
plot(pcaB$x[,1:3], col=cr[bford],pch=20, cex=2)
text(pcaB$x[,1:2], labels=rownames(pcaB$x), pos=4, col=cr[bford])
```
```{r cc_3dCorrB, fig.cap=fig$cap("rglB","PCA 1-3 on untransformed correlation matrix")}
pcaB <- pcaB$x
rgl::plot3d(pcaB[,1],pcaB[,2],pcaB[,3],type='s',col=cr[bford], size=1)
rgl::rgl.texts(pcaB[,1],pcaB[,2],pcaB[,3],abbreviate(rownames(pcaB)), col=cr[bford], adj=c(0,2))
subid <- currentSubscene3d()
rglwidget(elementId="rgl-pcaB",width=720,height=720)
```
# LDA
## Scree Plots
First we will take a look at the scree plots for each of the primitives
along with their corresponding elbows as given by Zhu and Ghodsi 2006.
```{r cc-scree1, echo=FALSE, h=6, w=7, fig.cap=fig$cap("scree1","Scree plots for F0-F5")}
par(mfrow= c(2,3))
el2 <- lapply(pcaL, function(x) getElbows(x$sdev, plot = FALSE))
tmp <- mapply(function(x,y) {
getElbows(x$sdev, plot = TRUE)
title(y)
},
pcaL, paste0("Scree plot F", 0:5))
```
## Cumulative Variance plots
```{r cc-cumsum1, h=6, w=7, fig.cap=fig$cap("cumsum1","% Variance Explained for F0-F5")}
par(mfrow= c(2,3))
for(i in 1:length(pcaL)){
zs <- rep(1, dim(pcaL[[i]]$x)[2])
zb <- rep(0, dim(pcaL[[i]]$x)[2])
zs[el2[[i]]] <- 2
zb[el2[[i]]] <- 2
plot(100 * cumsum(pcaL[[i]]$sdev) / sum(pcaL[[i]]$sdev),
axes = TRUE,
type='b', pch = 21,
bg = zb, col = zs,
xlab = 'd', ylab = '% Var',
main = paste0('% Var Explained for F', i - 1))
axis(side = 1, at = el2[[i]], padj = 1.25)
}
```
```{r cc-lda1}
dat <- lapply(pcaL, function(x) data.frame(x$x))
type <- truth <- factor(exType, ordered = FALSE)
tr <- as.numeric(truth)
lda.fit <- list()
lda.fit[[1]] <- lda(type ~ ., data = dat[[1]][,1:el2[[1]][2]])
lda.fit[[2]] <- lda(type ~ ., data = dat[[2]][,1:el2[[2]][2]])
lda.fit[[3]] <- lda(type ~ ., data = dat[[3]][,1:el2[[3]][2]])
lda.fit[[4]] <- lda(type ~ ., data = dat[[4]][,1:el2[[4]][2]])
lda.fit[[5]] <- lda(type ~ ., data = dat[[5]][,1:el2[[5]][2]])
lda.fit[[6]] <- lda(type ~ ., data = dat[[6]][,1:el2[[6]][1]])
lda.pred <- lapply(lda.fit, predict)
```
```{r cc-lda-cep}
Lm <-
foreach(i = 1:6, .combine=rbind) %:%
foreach(j = 1:24, .combine=rbind) %do% {
out <- NULL
try({
set.seed(12)
ldapred <- as.numeric(predict(lda(type ~ ., dat[[i]][1:j]))$class)
er <- 1/24 * sum(ldapred != as.numeric(truth))
out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
}, silent=TRUE)
out
}
rownames(Lm) <- NULL
Lm <- data.table(Lm)
Lcvm <-
foreach(i = 1:6, .combine=rbind) %:%
foreach(j = 1:24, .combine=rbind) %do% {
out <- NULL
try({
set.seed(12)
ldapred <- as.numeric((lda(type ~ ., dat[[i]][1:j], CV = TRUE))$class)
er <- 1/24 * sum(ldapred != as.numeric(truth))
out <- data.frame(Lhat = er, d = j, feat = as.factor(i - 1))
}, silent=TRUE)
out
}
rownames(Lcvm) <- NULL
Lcvm <- data.table(Lcvm)
Lm$CV <- "Without CV"
Lcvm$CV <- "With CV"
tmp <- data.table(rbind(Lm, Lcvm))
tmp$CV <- factor(tmp$CV)
tmp$feat <- factor(tmp$feat)
size <- foreach(i = 1:6, .combine=c) %do% {
size <- rep(1, 23)
size[el2[[i]]] <- 2
size
}
tmp$size <- factor(c(size,size))
levels(tmp$size) <- c("non-elbow", "elbow")
```
## Misclassification rates for LDA w/ and w/o CV.
```{r cc-cvPlot, h=6, w = 8, fig.cap=fig$cap("cvPlot","Lhat vs $d$ for FLD on F0-F5 with an h-line at chance")}
xvalVr <-
ggplot(tmp, aes(x = d, y = Lhat, color = feat)) +
facet_grid(. ~ CV) +
geom_line(alpha = 0.5) +
geom_point(alpha = 0.65, aes(size = size)) +
geom_hline(yintercept = 0.64236) +
ggtitle("FLD (LDA): xval vs. re-substitution")
print(xvalVr)
```
## Misclasifications vs $\hat d$.
```{r cc-lda-misclass, h=10, w=8, fig.cap = fig$cap("misclassLDAcv", "Misclassifications using LDA with CV for various $d$.")}
L <-
foreach(i = 1:6) %:%
foreach(j = 1:24, .combine = cbind) %do% {
out <- NULL
try({
set.seed(12)
ldapredcv <- as.numeric(lda(type ~ ., dat[[i]][1:j], CV = TRUE)$class)
out <- ldapredcv
}, silent=TRUE)
out
}
A <- lapply(L, function(x) { apply(x, 2, function(y) y != tr ) })
B <- lapply(A, function(x) { x[,1:10] } )
B <- lapply(B, function(x) {
rownames(x) <- channel[ford]
colnames(x) <- paste0('d', 1:dim(x)[2])
return(x)
})
par(mfrow = c(2,3))
corT <- paste0("Misclassifications ", "F", 0:5)
for(i in 1:6) {
corrplot(B[[i]], method='color', addgrid.col=1,
tl.col = ccol[ford],
mar=3*c(1,0,1,1.5))
title(corT[i])
}
```
The above plots show when LDA with cross-validation misclassifies each
datapoint. A misclassification is denoted by a filled block and columns
denote the embedding dimension $\hat d$.
```{r cc-voronoi,h=12,w=8,fig.cap=fig$cap("voronoi","Voronoi diagrams on class means from LDA on PCA of untransformed correlation matrices")}
titlesvor <- paste("LDA decision boundaries for", paste0("F", 0:5))
voronoidf <- lapply(lapply(lda.fit, '[[', 3), data.frame)
#voronoidf <- data.frame(x=lda.fit$means[,1],y=lda.fit$means[,2])
#This creates the voronoi line segments
par(mfrow = c(3,2))
for(i in 1:length(dat)){
plot(dat[[i]][,1:2], col=ccol3[ford], pch=20, cex=1.5)
title(titlesvor[i])
text(dat[[i]][,1:2], labels=rownames(dat[[i]]),
pos=ifelse(dat[[i]][,1]
[Back to Top][Data]