wrote:
There are 2 different Synap channels (2 different antibodies were
used), so that part is fine.
And 2 different VGluT1 channels (same antibody but done at different
times)
The NOS channel is the same, so count it as one even though it appears
twice. It is listed two times because it can be found at both excitatory
and inhibitory synapses. This is where your count of 25 comes, even
though there are 24 channels.
I would also add the 2 Synap channels to the Inhibitory presynaptic
category - there is supposed to be synapsin there, but at lower levels
compared to excitatory presynaptic category.
- Note: The order of the channels is given by line `227` in the `kohenen.m` file, which can be found in the dropbox folder.
- `datatransSynap` and `Synap` have been renamed to `Synap_1` and `Synap_2` for clarity.
- The two `VGlut1` channels have been renamed to `VGlut1_t1` and `VGlut1_t2` to distinguish between the different times of collection (which are unknown).
## Potential Filtering
Following is a discussion on which subset of markers could be used as a
subset to explore.
> On Thu, Apr 14, 2016 at 3:05 AM, Kristina Micheva kmicheva@stanford.edu wrote:
>
>I suggest:
>Synap, VGluT1, VGluT2, psd, gad, vgat, gephyr,
>
>Or a bit bigger:
>Synap, VGluT1, VGluT2, psd, gad, vgat, gephyr, VGlut3, CB1
>
>> On Apr 12, 2016, at 9:54 AM, Jesse L. Patsolic studiojlp@gmail.com wrote:
>>
>> Kristina,
>>
>> Out of the markers available, which do you think are the best to use as a subset?
>>
This subset has not yet been explored.
```{r cc_data, eval=TRUE}
## Load the full features table (rows = synapse pivots, 6 features per
## channel) with data.table::fread.
featFull <- fread("../Data/synapsinR_7thA.tif.Pivots.txt.2011Features.txt",showProgress=FALSE)
### Setting a seed and creating an index vector
### to select half of the data
set.seed(2^10)
half1 <- sample(dim(featFull)[1],dim(featFull)[1]/2)
feat <- featFull[half1,]
dim(feat)
## The 24 channel names, in the column order of the data (order given by
## line 227 of `kohenen.m`; see the notes above).
channel <- c('Synap_1','Synap_2','VGlut1_t1','VGlut1_t2','VGlut2','Vglut3',
'psd','glur2','nmdar1','nr2b','gad','VGAT',
'PV','Gephyr','GABAR1','GABABR','CR1','5HT1A',
'NOS','TH','VACht','Synapo','tubuli','DAPI')
## Synapse-type annotation for each channel, aligned element-wise with
## `channel` above.
channel.type <- c('ex.pre','ex.pre','ex.pre','ex.pre','ex.pre','in.pre.small',
'ex.post','ex.post','ex.post','ex.post','in.pre','in.pre',
'in.pre','in.post','in.post','in.post','in.pre.small','other',
'ex.post','other','other','ex.post','none','none')
nchannel <- length(channel)
## Features per channel (expected to be 6: F0..F5).
nfeat <- ncol(feat) / nchannel
## Channel type as a factor with a fixed level order ...
ffchannel <- (factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
## ... and the matching integer codes.
fchannel <- as.numeric(factor(channel.type,
levels= c("ex.pre","ex.post","in.pre","in.post","in.pre.small","other","none")
))
## Permutation of channel indices sorted by synapse type.
ford <- order(fchannel)
## One color per synapse type, then expanded to one color per channel.
Syncol <- c("#197300","#5ed155","#660000","#cc0000","#ff9933","mediumblue","gold")
ccol <- Syncol[fchannel]
## Coarse excitatory/inhibitory/other grouping: 11 ex, 6 in, 7 other.
exType <- factor(c(rep("ex",11),rep("in",6),rep("other",7)),ordered=TRUE)
exCol<-exType;levels(exCol) <- c("#197300","#990000","mediumblue");
exCol <- as.character(exCol)
## Column names <channel>F0..<channel>F5: sapply returns a 6 x 24 matrix
## and as.vector flattens it column-major, so names stay grouped by channel.
fname <- as.vector(sapply(channel,function(x) paste0(x,paste0("F",0:5))))
names(feat) <- fname
fcol <- rep(ccol, each=6)  # one color per feature column
## Palettes: colorpanel (gplots) and matlab.like (colorRamps) -- assumed
## attached in an earlier setup chunk; TODO confirm.
mycol <- colorpanel(100, "purple", "black", "green")
mycol2 <- matlab.like(nchannel)
```
## Transformations: Considering only `f0` Integrated Brightness
We will consider only the `f0` (integrated brightness) features, and will transform the feature vector data by
scaling and filtering.
```{r cc_featF0, eval=TRUE}
## Column positions of the F0 (integrated brightness) feature: every
## nfeat-th column starting at the first.
f0 <- seq(1, ncol(feat), by = nfeat)
featF0 <- subset(feat, select = f0)
## Min-max scale each channel to [0, 1], then stretch to [0, 1000].
rescale01 <- function(v) (v - min(v)) / (max(v) - min(v))
f01e3 <- 1e3 * data.table(apply(X = featF0, 2, rescale01))
fs <- f01e3
### Taking log_10 on data + 1.
log1f <- log10(featF0 + 1)
## z-score each column of the log data.
slog1f <- data.table(scale(log1f, center = TRUE, scale = TRUE))
```
We now have the following data sets:
- `featF0`: The feature vector looking only at the integrated brightness features.
- `fs`: The feature vector scaled between $[0,1000]$.
- `log1f`: The feature vector, plus one, then $log_{10}$ is applied.
- `slog1f`: The feature vector, plus one, $log_{10}$, then scaled by
subtracting the mean and dividing by the sample standard deviation.
### Kernel Density Estimates of the marginals
```{r cc_kde1, eval=TRUE,echo=TRUE,cache=FALSE,w=18,h=12,fig.cap=fig$cap("logKDE","Kernel density estimates for each channel, on `log` data.")}
## Long format for ggplot: one row per (observation, channel) value.
df <- melt(as.matrix(log1f))
names(df) <- c("ind","channel","value")
## Synapse type per row (fs has the same number of rows as log1f).
df$type <- factor(rep(ffchannel,each=dim(fs)[1]),levels=levels(ffchannel))
## Permutation of the 24 channels that groups them by synapse type.
lvo <- c(1:5,7:10,19,22,11:16,6,17,18,20,21,23,24)
## BUG FIX: `levels(df$channel) <- levels(df$channel)[lvo]` RENAMES the
## levels in place -- each observation keeps its code but is shown under
## another channel's label, mislabelling every facet.  Rebuilding the
## factor with the permuted level ORDER reorders the facets while keeping
## each value attached to its true channel; ccol[lvo] then lines up with
## the new level order.
df$channel <- factor(df$channel, levels = levels(df$channel)[lvo])
ts <- 22  # base text size for the theme
gg1 <- ggplot(df, aes(x=value)) +
scale_color_manual(values=ccol[lvo]) +
scale_fill_manual(values=ccol[lvo]) +
geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
geom_density(aes(group=channel, color=channel),size=1.5) +
facet_wrap( ~ channel, scale='free', ncol=6) +
theme(plot.title=element_text(size=ts),
axis.title.x=element_text(size=ts),
axis.title.y=element_text(size=ts),
legend.title=element_text(size=ts),
legend.text=element_text(size=ts-2),
axis.text=element_text(size=ts-2),
strip.text=element_text(size=ts),
legend.position='none')+
ggtitle("Kernel Density Estimates of `log1f` data.")
print(gg1)
```
```{r cc_kde2, eval=FALSE,echo=FALSE,cache=FALSE,w=18,h=12,fig.cap=fig$cap("flogsKDE","Kernel density estimates for each channel, on `slog1f` data.")}
## Long format for ggplot: one row per (observation, channel) value of
## the scaled log data.
df2 <- melt(as.matrix(slog1f))
names(df2) <- c("ind","channel","value")
df2$type <- factor(rep(ffchannel,each=dim(slog1f)[1]),levels=levels(ffchannel))
## Permutation of the 24 channels that groups them by synapse type.
lvo <- c(1:5,7:10,19,22,11:16,6,17,18,20,21,23,24)
## BUG FIX: assigning to levels() would RELABEL the data in place and
## mislabel every facet (see the matching fix in the `log1f` KDE chunk).
## Rebuilding the factor with the permuted level order reorders facets
## without detaching values from their true channels.
df2$channel <- factor(df2$channel, levels = levels(df2$channel)[lvo])
ts <- 22  # base text size for the theme
gg2 <- ggplot(df2, aes(x=value)) +
scale_color_manual(values=ccol[lvo]) +
scale_fill_manual(values=ccol[lvo]) +
geom_histogram(aes(y=..density..,group=channel,colour=channel),bins=100) +
geom_density(aes(group=channel, color=channel),size=1.5) +
facet_wrap( ~ channel, scale='free', ncol=6) +
theme(plot.title=element_text(size=ts),
axis.title.x=element_text(size=ts),
axis.title.y=element_text(size=ts),
legend.title=element_text(size=ts),
legend.text=element_text(size=ts-2),
axis.text=element_text(size=ts-2),
strip.text=element_text(size=ts),
legend.position='none')+
#ggtitle("Kernel Density Estimates of z-scored º log_10 º feature vector +1 data ")
ggtitle("Kernel Density Estimates of `slog1f` data.")
print(gg2)
```
## Correlations
```{r cc_corLog,w=5,h=5, eval=TRUE,fig.cap=fig$cap("corLOG","Correlation on log_10 data, reordered by synapse type.")}
## Number of channels in each synapse-type group, used to size the
## rectangle overlay on the correlation plot.
grpSizes <- as.numeric(table(fchannel))
## Channel-by-channel correlation of the scaled log data, displayed with
## rows/columns grouped by synapse type.
cmatslog1f <- cor(slog1f)
corrplot(cmatslog1f[ford, ford], method = "color", tl.col = ccol[ford], tl.cex = 0.8)
corrRect(grpSizes, col = Syncol, lwd = 4)
```
### PCA on the Correlation Matrix
```{r pcaLog, eval=TRUE}
## PCA of the channel-channel correlation matrix (centered and scaled).
pcaLog <- prcomp(cmatslog1f, center = TRUE, scale = TRUE)
## Candidate elbows of the scree curve (getElbows is a project-local
## helper; presumably an elbow-detection routine -- verify).
elLog <- getElbows(pcaLog$sdev, plot = FALSE)
```
We run K-means for $K=3$ on the PCA embedded in $\mathbb{R}^3$ of the correlation matrix.
```{r cc_Corkmeans,eval=TRUE,echo=TRUE}
K1 <- c(3) ## The set of K's.
## k-means on the first three PCs of the correlation-matrix PCA, one run
## per entry of K1; the seed fixes the random initialization.
kvecCor <- foreach(k = K1) %dopar% {
  set.seed(2^4 - 1)
  kmeans(pcaLog$x[, 1:3], centers = k)
}
```
### Plots of embeddings
```{r cc_2dEmb,h=6,w=11,fig.cap=fig$cap("pca2dEm", "2d Embeddings.")}
## Side-by-side 2-D PCA embeddings of the correlation matrix:
## left colored by synapse type, right by the 3-means clustering.
par(mfrow=c(1,2))
## NOTE(review): rows of pcaLog$x follow the ORIGINAL channel order
## (pcaLog was computed from the unpermuted cmatslog1f), while ccol[ford]
## is the type-sorted color vector -- confirm this pairing is intended;
## plain ccol may be what is wanted here.
plot(pcaLog$x[,1:2],
col=ccol[ford],
pch=as.numeric(exType)+15,
cex=1.5,
xlim=c(min(pcaLog$x[,1])-0.2,max(pcaLog$x[,1])+0.7),
main="Embedding of PCA log_10 correlation data")
## Abbreviated channel names next to each point.
text(pcaLog$x[,1:2],label=abbreviate(rownames(pcaLog$x)),offset=1, pos=4)
## Same embedding, colored/shaped by the 3-means cluster assignment.
plot(pcaLog$x[,1:2],
col=as.numeric(kvecCor[[1]]$cluster)+1,
pch=as.numeric(kvecCor[[1]]$cluster)-1,
cex=1.5,
xlim=c(min(pcaLog$x[,1])-0.2,max(pcaLog$x[,1])+0.7),
main="Embedding of PCA log_10 correlation data, \ncolor based on 3-means clustering.")
text(pcaLog$x[,1:2],col='black',label=abbreviate(rownames(pcaLog$x)),offset=1, pos=4)
```
```{r cc_3dLog}
## Interactive 3-D scatter of the first three PCs, one sphere per channel.
pca <- pcaLog$x[,1:3]
## NOTE(review): as in the 2-D plot, ccol[ford] permutes colors while the
## rows of `pca` are in original channel order -- confirm intended.
rgl::plot3d(pca[,1],pca[,2],pca[,3],type='s',col=ccol[ford], size=1, main="Log")
## NOTE(review): rgl.texts belongs to the deprecated rgl.* interface in
## newer rgl releases; text3d is the supported equivalent -- check the
## installed rgl version before changing.
rgl::rgl.texts(pca[,1],pca[,2],pca[,3], abbreviate(rownames(pca)), col=ccol[ford], adj=c(0,1.5))
subid <- currentSubscene3d()
## Embed the WebGL widget in the rendered document.
rglwidget(elementId="plot3dLog", width=720, height=720)
```
## K-Means Level 1
Next we run K-means with $K=2$.
**Note that a seed is being set for the random initialization of K-means.**
```{r cc_kmeans,eval=TRUE,echo=TRUE}
K2 <- c(2) ## The set of K's.
## k-means directly on the scaled log data, one run per entry of K2;
## the fixed seed makes the random initialization reproducible.
kvecslog1f <- foreach(k = K2) %dopar% {
  set.seed(2^4 - 1)
  kmeans(slog1f, centers = k)
}
```
### Heat maps: scaled data.
For the following we manually choose 2 clusters.
```{r cc_agg, eval=TRUE}
## Formatting data for heatmap: mean feature value per cluster
## (rows = clusters, columns = channels).
aggslog1f <- aggregate(slog1f, by = list(lab = kvecslog1f[[1]]$cluster), FUN = mean)
aggslog1f <- as.matrix(aggslog1f[, -1])  # drop the cluster-label column
## Row labels report the fraction of the data in each cluster
## (clusterFraction is a project-local helper).
rownames(aggslog1f) <- clusterFraction(kvecslog1f[[1]])
## Channel ordering by synapse type (same permutation as earlier).
ford <- order(fchannel)
```
```{r cc_km1-heatmap,eval=FALSE,echo=FALSE,w=6,h=6,fig.cap=fig$cap("heat1", "Heatmap of the cluster means vs channels. Rows and columns are rearranged according to hclust.")}
## Cluster-mean heatmap; rows and columns rearranged by hclust dendrograms
## (gplots::heatmap.2).
heatmap.2(as.matrix(aggslog1f), trace="none",col=mycol,colCol=ccol,cexRow=0.8, keysize=1,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90,main="Heatmap of `slog1f` data") #
```
```{r cc_km2-heatmapSorted,eval=TRUE,w=6,h=6,fig.cap=fig$cap("heatFLOG1", "Heatmap of the cluster means vs channels. Rows and columns are rearranged according to synapse type.")}
## Cluster-mean heatmap with columns fixed in synapse-type order
## (Colv=NA disables column clustering); label colors follow the same order.
heatmap.2(as.matrix(aggslog1f[,ford]),dendrogram='row',Colv=NA,trace="none", col=mycol,colCol=ccol[ford],cexRow=0.8, keysize=1.25,symkey=FALSE,symbreaks=FALSE,scale="none", srtCol=90,main="Heatmap of `slog1f` data.")
```
Percentage of data within cluster is presented on the right side of the heatmap.
# Exploring pair-wise relationships
```{r sampleData}
### Sampling to reduce size: keep 250k random rows so the pairwise
### plots below stay tractable.
set.seed(2^13 - 2)
s1 <- sample(nrow(slog1f), 2.5e5)
dlog1f <- data.table(log1f[s1, ])
```
## `GABABR`
```{r gabLat}
## re-formatting data for use in lattice
## Stack every channel except GABABR into one long `values` column ...
dlog1f2 <- data.table(stack(dlog1f, select=-GABABRF0))[,.(values)]
## ... and pair each stacked value with the GABABR value of the same row.
dlog1f2$GABABR <- dlog1f$GABABRF0
### Adding relationship factor variables
## Panel labels of the form "GABABR~<abbreviated other channel>".
nd <- paste0("GABABR","~",abbreviate(channel[-which(channel=="GABABR")]))
dlog1f2$ind <- factor(rep(nd,each=dim(dlog1f)[1]), ordered=TRUE,levels=nd)
## x = other-channel value, y = GABABR, g = panel label.
names(dlog1f2) <- c("x","y","g")
## Hexbin scatter of GABABR against every other channel, one panel per
## pair, with a loess trend overlaid.
rg1 <- xyplot(y ~ x | g, data=dlog1f2,
as.table=TRUE,
colramp=BTC,
pch='.',
scales = list(y = list(relation = "free"),x = list(relation = "free")),
panel=function(x,y,...){
panel.hexbinplot(x,y,..., type='g')
panel.loess(x,y,col='red', lwd=2,...)
}
)
```
```{r rg1,eval=TRUE,echo=FALSE,w=10,h=10,fig.cap=fig$cap("rg1", "Lattice plot of pairwise regressions involving `GABABR`")}
## Render the lattice hexbin plot built in the previous chunk.
print(rg1)
```
# ARI
```{r cc_f1, eval=TRUE, echo=FALSE}
avd <- function(pcaObj, elbow, truth, centers = K1, nperm = 1e4){
  ###
  ### Given a pca object with an elbow as upper-bound, compute the ARI of
  ### `truth` versus a k-means clustering of the first d principal
  ### components, for each d in 1:elbow.  Also build an approximate
  ### permutation null for the ARI at each dimension.
  ###
  ### pcaObj  : prcomp-style object (only pcaObj$x scores are used).
  ### elbow   : largest embedding dimension to consider (>= 1).
  ### truth   : ground-truth label vector.
  ### centers : number of k-means clusters.  Defaults to the global K1 so
  ###           existing three-argument calls behave exactly as before.
  ### nperm   : permutations per dimension (previously hard-coded 1e4).
  ###
  ### Returns a data.frame with one row per (dimension, permutation):
  ###   Embedding_Dimension (factor), permARI (null ARI), ari (observed).
  require(foreach)
  MAX <- nperm
  kv <- foreach(i = seq_len(elbow)) %do% {
    set.seed(i*3 +2)  # reproducible k-means initialization per dimension
    ### kvec for ith dimension of PCA
    k <- as.numeric(kmeans(pcaObj$x[,1:i],centers=centers)$cluster)
    ## ARI of clustering vs truth
    ari <- mclust::adjustedRandIndex(truth,k)
    list(k=k,ari=ari)
  }
  out <- foreach(i = seq_len(elbow),.combine='rbind') %:%
    foreach(j = seq_len(MAX),.combine='rbind') %do% {
      set.seed(j*3 + 1)  # reproducible permutation draw
      ### Calculate the permutation
      y <- sample(kv[[i]]$k)
      ## ARI of the permuted labels vs truth (one draw from the null)
      data.frame(Embedding_Dimension=i,permARI=mclust::adjustedRandIndex(truth,y))
    }
  out$Embedding_Dimension <- as.factor(out$Embedding_Dimension)
  clusterARI <- sapply(kv,'[[',2)  # observed ARI per dimension
  out$ari <- rep(clusterARI, each=MAX)
  return(out)
}
```
Here we consider the channel types to be the "ground truth" and compute
the Adjusted Rand Index between that and the output from k-means.
## Approximate permutation test.
```{r cc_truth}
## Collapse the 7 channel types into 3 coarse classes by renaming the
## levels in place (intentional here): (ex.pre, ex.post) -> "ex";
## (in.pre, in.post) -> "in"; (in.pre.small, other, none) -> "other".
## NOTE(review): this maps in.pre.small to "other" rather than "in" --
## confirm that is the intended ground truth.
levels(ffchannel) <- c(rep("ex", 2), rep("in", 2), rep("other", 3))
levels(ffchannel)
## Integer-coded ground-truth labels used by the ARI computations below.
truth <- as.numeric(ffchannel)
```
```{r arislog1f,eval=FALSE,echo=FALSE}
## Observed ARI between the ground-truth types and the clustering of the
## correlation-matrix PCA.
kcl <- as.numeric(kvecCor[[1]]$cluster)
v1 <- mclust::adjustedRandIndex(truth, kcl)
## Null distribution: ARI of truth vs. random permutations of the labels.
ariCor <- foreach(j = 1:1e5, .combine = 'c') %dopar% {
  set.seed(j*2^8-3) # for reproducibility
  mclust::adjustedRandIndex(truth, sample(kcl))
}
N <- length(ariCor)
## One-sided empirical p-value (printed because of the outer parentheses).
(pval <- sum(ariCor >= v1)/N)
```
```{r cc_ariPlots,eval=FALSE,echo=FALSE,w=4,h=4, fig.cap=fig$cap("ariPlot","ARI plot")}
## Histogram of the permutation-null ARIs, with the observed ARI marked.
h1 <- hist(ariCor,xlim=c(min(ariCor),v1+v1/2), prob=TRUE, breaks='Sc',plot=TRUE,main=NULL, xlab="ARI")
title("Approx. permutation test.")
abline(v = v1, lwd=2, col='darkred')
## BUG FIX: mtext() draws in the figure margin and returns NULL, so
## `labels=mtext(...)` put the annotation in the margin and handed text()
## a NULL label.  Pass the bquote() expression to text() directly.
text(median(h1$breaks), quantile(h1$density,.98), col='darkred',
     labels=bquote(hat(p)==.(pval)))
```
Making a data.table of the permutation data for ggplot.
```{r cc-ariVdim}
## Permutation results keyed by embedding dimension.
DT <- data.table(avd(pcaLog, elLog[2], truth),key='Embedding_Dimension')
## One-sided empirical p-value per embedding dimension.
DT <- DT[,pval := sum(permARI>=ari)/length(permARI),by=Embedding_Dimension]
## Observed ARI per dimension.
ua <- DT[,unique(ari),by=Embedding_Dimension]
## FIX: spell out the full column name -- `ua$Emb` relied on `$` partial
## name matching, which is fragile.  as.numeric() on the factor yields
## level codes 1..elbow, which coincide with the dimensions here.
arid <- data.frame(Embedding_Dimension=as.numeric(ua$Embedding_Dimension),
                   ARI=ua$V1,
                   pval=DT[,unique(pval),by=Embedding_Dimension]$V1)
```
```{r cc-gg3, w=20,h=12, fig.cap=fig$cap("ariPerm","ARI Permutation Tests")}
## Faceted histograms of the null ARI per embedding dimension, with the
## observed ARI drawn as a vertical line and the p-value in the strip.
## NOTE(review): `..density..` is the legacy ggplot2 spelling; newer
## releases prefer after_stat(density) -- keep if pinned to an old ggplot2.
gg3 <- ggplot(data=DT,aes(x=permARI, y=..density..,color=Embedding_Dimension,label=pval)) +
#geom_histogram(binwidth=3.49*sd(DT$permARI)*length(DT$permARI)^(-1/3)) +
geom_histogram(bins=25)+
## Observed ARI for each dimension.
geom_vline(aes(xintercept=ari),colour='darkred',size=1.2)+
#geom_text(aes(x=(ari-ari/2),y=7))+
theme(axis.text=element_text(size=18),
title=element_text(size=16),
strip.text.x=element_text(size=16)) +
facet_wrap(~Embedding_Dimension+pval,scale='free',labeller=label_both)
print(gg3)
```
```{r arivDim, w=12,h=8, fig.cap=fig$cap("ari_pvals","ARI and P-Values")}
## Observed ARI per embedding dimension, with estimated p-values.
gg5 <- ggplot(data=arid,aes(x=Embedding_Dimension,y=ARI,label=pval,colour=pval)) +
geom_line(size=1.5,colour='salmon') + geom_point(size=3) +
#geom_text(hjust='right',vjust='top',nudge_x=0.5,nudge_y=0.01,size=5)+
theme(axis.text=element_text(size=18),
title=element_text(size=16)) +
ggtitle("ARI vs. DIM with estimated p-values")
## Companion panel: the p-values themselves on a log scale.
gg6 <-
ggplot(data=arid, aes(x=Embedding_Dimension, y=pval, colour=pval))+
geom_line(size=1.5, colour='salmon') +
geom_point(size=3) +
# geom_hline(yintercept=0.05,
#            colour='forestgreen',
#            size=1.5) +
## BUG FIX: the middle break was written `1.53-4`, which R parses as the
## arithmetic expression 1.53 - 4 = -2.47 -- an invalid (negative) break
## on a log10 scale.  The intended scientific-notation constant is 1.5e-4.
scale_y_log10(breaks=c(1e-4,1.5e-4,1e-3), na.value=0) +
theme(axis.text=element_text(size=18),
title=element_text(size=16)) +
ggtitle("P-values")
grid.arrange(gg5,gg6,nrow=2)
```