---
title: "Comparison of HVGs"
subtitle: "{{< meta subtitle_seurat >}} {{< meta subtitle_bioc >}} {{< meta subtitle_scanpy >}}"
description: "Overview of all three pipeline results."
format: html
---


{{< meta qc_data_2 >}}

```{r}
#| label: libraries
suppressPackageStartupMessages({
    library(Seurat)
    library(zellkonverter)
    library(Matrix)
    library(ggplot2)
    library(patchwork)
    library(scran)
    library(ComplexHeatmap)
    library(basilisk)
})

devtools::source_url("https://raw.githubusercontent.com/asabjorklund/single_cell_R_scripts/main/overlap_phyper_v2.R")

```


## Load data


OBS! Zellkonverter installs conda env with basilisk! Takes a while to run first time!!


```{r}
#| label: load
path_results <- "data/covid/results"
if (!dir.exists(path_results)) dir.create(path_results, recursive = T)

path_seurat = "../seurat/data/covid/results/"
path_bioc = "../bioc/data/covid/results/"
path_scanpy = "../scanpy/data/covid/results/"

path_results <- "data/covid/results"
if (!dir.exists(path_results)) dir.create(path_results, recursive = T)

path_seurat = "../seurat/data/covid/results/"
path_bioc = "../bioc/data/covid/results/"
path_scanpy = "../scanpy/data/covid/results/"

# fetch the files with qc and dimred for each 

# seurat
sobj = readRDS(file.path(path_seurat,"seurat_covid_qc_dr_int_cl.rds"))

# bioc
sce = readRDS(file.path(path_bioc,"bioc_covid_qc_dr_int_cl.rds"))
bioc = as.Seurat(sce)

# scanpy
scanpy.sce = readH5AD(file.path(path_scanpy, "scanpy_covid_qc_dr_scanorama_cl.h5ad"))
scanpy = as.Seurat(scanpy.sce, counts = NULL, data = "X") # only have the var.genes data that is scaled.
```

## Umaps

```{r}
#| label: umaps
#| fig-height: 4
#| fig-width: 10

wrap_plots(
    DimPlot(sobj, group.by = "orig.ident") + NoAxes() + ggtitle("Seurat"),
    DimPlot(bioc, group.by = "sample") + NoAxes() + ggtitle("Bioc"),
    DimPlot(scanpy, group.by = "sample", reduction = "X_umap_uncorr") + NoAxes() + ggtitle("Scanpy"),
    ncol = 3
)
```

Create one dataset with the cells that are present in all samples. Also add in umap from all 3 pipelines.

```{r}
#| label: make-object
meta.seurat = sobj@meta.data
meta.scanpy = scanpy@meta.data
meta.bioc = bioc@meta.data

meta.bioc$cell = rownames(meta.bioc)
meta.scanpy$cell = sapply(rownames(meta.scanpy), function(x) substr(x,1,nchar(x)-2))
meta.seurat$cell = unlist(lapply(strsplit(rownames(meta.seurat),"_"), function(x) x[3]))
```


```{r}
#| label: make-objects2
in.all = intersect(intersect(meta.scanpy$cell, meta.seurat$cell), meta.bioc$cell)

tmp1 = meta.bioc[match(in.all, meta.bioc$cell),]
colnames(tmp1) = paste0(colnames(tmp1),"_bioc")
tmp2 = meta.scanpy[match(in.all, meta.scanpy$cell),]
colnames(tmp2) = paste0(colnames(tmp2),"_scpy")

all = sobj[,match(in.all, meta.seurat$cell)]

meta.all = cbind(all@meta.data, tmp1,tmp2)
all@meta.data = meta.all

Reductions(all)

tmp = bioc@reductions$UMAP_on_PCA@cell.embeddings[match(in.all, meta.bioc$cell),]
rownames(tmp) = colnames(all)
all[["umap_bioc"]] = CreateDimReducObject(tmp, key = "umapbioc_", assay = "RNA")
tmp = scanpy@reductions$X_umap_uncorr@cell.embeddings[match(in.all, meta.scanpy$cell),]
rownames(tmp) = colnames(all)
all[["umap_scpy"]] = CreateDimReducObject(tmp, key = "umapscpy_", assay = "RNA")

Reductions(all)
```

## Variable features

In Seurat, `FindVariableFeatures` is not batch aware unless the data is split into layers by samples, here we have the variable genes created with layers. In Bioc `modelGeneVar` we used sample as a blocking parameter, e.g calculates the variable genes per sample and combines the variances. Similar in scanpy we used the samples as `batch_key`. 


```{r}
#| label: hvg
hvgs = list()
hvgs$seurat = VariableFeatures(sobj)
hvgs$bioc = sce@metadata$hvgs
# scanpy has no strict number on selection, instead uses dispersion cutoff. So select instead top 2000 dispersion genes.
scpy.hvg = rowData(scanpy.sce)
hvgs_scanpy = rownames(scpy.hvg)[scpy.hvg$highly_variable]
hvgs$scanpy = rownames(scpy.hvg)[order(scpy.hvg$dispersions_norm, decreasing = T)][1:2000]

cmat = make_comb_mat(hvgs)
print(cmat)

UpSet(cmat)
```
Surprisingly low overlap between the methods and many genes that are unique to one pipeline. With discrepancies in the doublet filtering the cells used differ to some extent, but otherwise the variation should be similar. Even if it is estimated in different ways.

Is the differences more due to the combination of ranks/dispersions or also found within a single dataset?

Only Seurat have the dispersions for each individual dataset stored in the object. 

### Compare dispersions

Recalculate for bioc.

```{r}
#| label: bioc
var.out <- modelGeneVar(sce, block = sce$sample)
hvgs_bioc <- getTopHVGs(var.out, n = 2000)
cutoff <- rownames(var.out) %in% hvgs_bioc
```


```{r}
#| label: dispersions
#| fig-height: 10
#| fig-width: 12

# Merge all hvg info

all.genes = intersect(rownames(var.out), rownames(scpy.hvg))

scpy.hvg = rowData(scanpy.sce)
colnames(scpy.hvg) = paste0(colnames(scpy.hvg), "_scpy")
colnames(var.out) = paste0(colnames(var.out), "_bioc")

seu.hvg = HVFInfo(all) 

all.hvg = cbind(seu.hvg[all.genes,], scpy.hvg[all.genes,], var.out[all.genes,])


par(mfrow=c(2,2))
plot(all.hvg$means_scpy, all.hvg$dispersions_norm_scpy, main = "Scanpy",pch = 16, cex = 0.4)
points(all.hvg$means_scpy[all.hvg$highly_variable_scpy], all.hvg$dispersions_norm_scpy[all.hvg$highly_variable_scpy], col="red",pch = 16, cex = 0.4)

plot(all.hvg$mean_bioc, all.hvg$bio_bioc, pch = 16, cex = 0.4, main = "Bioc")
points(all.hvg$mean_bioc[rownames(all.hvg) %in% hvgs$bioc], all.hvg$bio_bioc[rownames(all.hvg) %in% hvgs$bioc], col = "red", pch = 16, cex = .6)

   
plot(log1p(all.hvg$mean), all.hvg$variance.standardized, main = "Seurat",pch = 16, cex = 0.4)    
points(log1p(all.hvg$mean)[match(hvgs$seurat, rownames(all.hvg))], all.hvg$variance.standardized[match(hvgs$seurat, rownames(all.hvg))],col="red",pch = 16, cex = 0.4)    


```

Scanpy uses min_disp=0.5, min_mean=0.0125, max_mean=3, so the highly expressed ones are not included.

Seurat uses `variance.standardized` to rank the genes. Bioc uses the `bio` slot with estimated biological variation.   


```{r}
#| label: pairs-plot
all.hvg$mean_log = log1p(all.hvg$mean)
sel.means = c("mean","mean_log","means_scpy", "mean_bioc")
pairs(all.hvg[,sel.means])

sel.disp = c("variance.standardized","dispersions_norm_scpy", "bio_bioc","total_bioc")
pairs(all.hvg[,sel.disp])
```

Difference in what cells were used, and also in how the dispersions across the samples is combined into one value. Do for just one sample instead.

## For one sample


Run hvg selection for ctrl.13 sample, so that the handling of batches is not influencing results. Use default settings in all the different methods.

### Seurat

Try the different methods implemented in Seurat. From the help section:

```
* “vst”: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
* “mean.var.plot” (mvp): First, uses a function to calculate average expression (mean.function) and dispersion (dispersion.function) for each feature. Next, divides features into num.bin (deafult 20) bins based on their average expression, and calculates z-scores for dispersion within each bin. The purpose of this is to identify variable features while controlling for the strong relationship between variability and average expression
* “dispersion” (disp): selects the genes with the highest dispersion values
```

Feature selection for individual datasets

In each dataset, we next aimed to identify a subset of features (e.g., genes) exhibiting high variability across cells, and therefore represent heterogeneous features to prioritize for downstream analysis. Choosing genes solely based on their log-normalized single-cell variance fails to account for the mean-variance relationship that is inherent to single-cell RNA-seq. Therefore, we first applied a variance-stabilizing transformation to correct for this [Mayer et al., 2018, Hafemeister and Satija, 2019].
To learn the mean-variance relationship from the data, we computed the mean and variance of each gene using the unnormalized data (i.e., UMI or counts matrix), and applied 
-transformation to both. We then fit a curve to predict the variance of each gene as a function of its mean, by calculating a local fitting of polynomials of degree 2 (R function loess, span = 0.3). This global fit provided us with a regularized estimator of variance given the mean of a feature. As such, we could use it to standardize feature counts without removing higher-than-expected variation.


```{r}
#| label: seurat
ctrl = all[,all$orig.ident == "ctrl_13"]
ctrl@assays$RNA@meta.data[,1:ncol(ctrl@assays$RNA@meta.data)] = NULL

ctrl = FindVariableFeatures(ctrl)
hvg_seu = list()
hvg_seu$vst = VariableFeatures(ctrl)
#top20 <- head(hvg_seu$vst, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)


ctrl = FindVariableFeatures(ctrl, selection.method = "mean.var.plot")
hvg_seu$mvp = VariableFeatures(ctrl)
#top20 <- head(hvg_seu$mvp, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)

ctrl = FindVariableFeatures(ctrl, selection.method = "dispersion")
hvg_seu$disp = VariableFeatures(ctrl)
#top20 <- head(hvg_seu$disp, 20)
#LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE)

cmat = make_comb_mat(hvg_seu)
cmat
```

Very little overlap between the methods in Seurat. Mvp and disp are calculated on `data`, while vst is done on `counts`. 


```{r}
#| label: plot-seurat
vinfo.seu = ctrl@assays$RNA@meta.data
rownames(vinfo.seu) = rownames(ctrl)

vinfo.seu$vg_vst = rownames(vinfo.seu) %in% hvg_seu$vst
vinfo.seu$vg_disp = rownames(vinfo.seu) %in% hvg_seu$disp
vinfo.seu$vg_mvp = rownames(vinfo.seu) %in% hvg_seu$mvp
vinfo.seu$vg_vst_disp = vinfo.seu$vg_vst + vinfo.seu$vg_disp == 2
vinfo.seu$hvinfo = ifelse(vinfo.seu$vg_vst, "VST",NA)
vinfo.seu$hvinfo[vinfo.seu$vg_mvp] = "MVP"
vinfo.seu$hvinfo[vinfo.seu$vg_disp] = "DISP"
vinfo.seu$hvinfo[vinfo.seu$vg_vst_disp] = "VST/DISP"


means = colnames(vinfo.seu)[grepl("mean", colnames(vinfo.seu))]
disp = c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","vf_mvp_data.ctrl_13_mvp.dispersion.scaled")
pairs(vinfo.seu[,means])


plot_hvg = function(df, m,d,vg, log=FALSE){
  top20 = head(vg,20)
  p = ggplot(df, aes(x=.data[[m]], y=.data[[d]], color=hvinfo)) + geom_point() + theme_classic() 
  if (log) { p = p + scale_x_log10()}
  LabelPoints(plot = p, points = top20, repel = TRUE)
}

p1S =   plot_hvg(vinfo.seu, "vf_vst_counts.ctrl_13_mean", "vf_vst_counts.ctrl_13_variance.standardized",hvg_seu$vst, log=TRUE)
p2S =   plot_hvg(vinfo.seu, "vf_disp_data.ctrl_13_mvp.mean", "vf_disp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$disp)
p3S =   plot_hvg(vinfo.seu, "vf_mvp_data.ctrl_13_mvp.mean", "vf_mvp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$mvp)
wrap_plots(p1S,p2S,p3S,   ncol=2)


```


### Bioc

Run the bioconductor detection method with `modelGeneVar` and `getTopHVGs`. Plot both total variance and "bio" variance.


```{r}
#| label: bioc2
ctrl.sce = as.SingleCellExperiment(ctrl)
var.out <- modelGeneVar(ctrl.sce)
hvgs_bioc <- getTopHVGs(var.out, n = 2000)

var.out.df = data.frame(var.out)
var.out.df$hvg = rownames(var.out) %in% hvgs_bioc
p = ggplot(var.out.df, aes(x=mean, y=total, colour = hvg)) + geom_point() + theme_classic()

top20 <- head(hvgs_bioc, 20)
pB = LabelPoints(plot = p, points = top20, repel = TRUE)
pB


p2 = ggplot(var.out.df, aes(x=mean, y=bio, colour = hvg)) + geom_point() + theme_classic()

top20 <- head(hvgs_bioc, 20)
pB2 = LabelPoints(plot = p2, points = top20, repel = TRUE)
pB2

```

### Scanpy

Run with both `seurat` (on lognorm data) and `seurat_v3` (on counts, is same as vst)


For the dispersion-based methods (flavor='seurat' Satija et al. [2015] and flavor='cell_ranger' Zheng et al. [2017]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected.

For flavor='seurat_v3'/'seurat_v3_paper' [Stuart et al., 2019], a normalized variance for each gene is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. Only if batch_key is not None, the two flavors differ: For flavor='seurat_v3', genes are first sorted by the median (across batches) rank, with ties broken by the number of batches a gene is a HVG. For flavor='seurat_v3_paper', genes are first sorted by the number of batches a gene is a HVG, with ties broken by the median (across batches) rank.


```{r}
#| label: scanpy
penv = "/Users/asabjor/miniconda3/envs/scanpy_2024_nopip"
hvg.scanpy = basiliskRun(env=penv, fun=function(counts) {
    scanpy <- reticulate::import("scanpy")
    ad = reticulate::import("anndata")
    adata = ad$AnnData(counts)
    print(adata$X[1:10,1:10])
    var1 = scanpy$pp$highly_variable_genes(adata, flavor = "seurat_v3", inplace=FALSE)
    scanpy$pp$normalize_per_cell(adata, counts_per_cell_after=1e4)
    scanpy$pp$log1p(adata)
    print(adata$X[1:10,1:10])
    scanpy$pp$highly_variable_genes(adata)
    return(list(disp=adata$var, vst=var1))
    
}, counts = t(ctrl@assays$RNA@layers$counts.ctrl_13),  testload="scanpy")


#flavor
#Literal['seurat', 'cell_ranger', 'seurat_v3', 'seurat_v3_paper'] (default: 'seurat')
```


```{r}
#| label: scanpy2
rownames(hvg.scanpy$disp) = rownames(ctrl)
rownames(hvg.scanpy$vst) = rownames(ctrl)


top20 <- head(rownames(hvg.scanpy$disp)[order(hvg.scanpy$disp$dispersions_norm, decreasing = T)], 20)
p = ggplot(hvg.scanpy$disp, aes(x=means, y=dispersions_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy disp")

pS = LabelPoints(plot = p, points = top20, repel = TRUE)
pS 

top20 <- head(rownames(hvg.scanpy$vst)[order(hvg.scanpy$vst$variances_norm, decreasing = T)], 20)
p2 = ggplot(hvg.scanpy$vst, aes(x=means, y=variances_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy vst") + scale_x_log10()

pS2 = LabelPoints(plot = p2, points = top20, repel = TRUE)
pS2

```

### All together

```{r}
#| label: merge-all
colnames(hvg.scanpy$disp) = paste0(colnames(hvg.scanpy$disp), "_scpyD")
colnames(hvg.scanpy$vst) = paste0(colnames(hvg.scanpy$vst), "_scpyV")
colnames(var.out) = paste0(colnames(var.out), "_bioc")
ctrl.hvg = cbind(vinfo.seu, var.out, hvg.scanpy$disp, hvg.scanpy$vst)
```

```{r}
#| label: plot-all
#| fig-height: 10
#| fig-width: 12


wrap_plots(p1S + ggtitle("Seurat vst"),p2S + ggtitle("Seurat disp"),p3S + ggtitle("Seurat mvp"),pB + ggtitle("Bioc total"),pS,pS2, ncol=3)
 
```


```{r}
#| label: disps-all
sel = c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","bio_bioc", "dispersions_norm_scpyD", "variances_norm_scpyV")

pairs(ctrl.hvg[,sel])
```

```{r}
#| label: means-all
sel = c("vf_vst_counts.ctrl_13_mean_log","vf_disp_data.ctrl_13_mvp.mean", "mean_bioc", "means_scpyD", "means_scpyV_log")

ctrl.hvg$vf_vst_counts.ctrl_13_mean_log = log1p(ctrl.hvg$vf_vst_counts.ctrl_13_mean)
ctrl.hvg$means_scpyV_log = log1p(ctrl.hvg$means_scpyV)

pairs(ctrl.hvg[,sel])
```


```{r}
#| label: overlap

hvgs = hvg_seu
hvgs$scpyD = rownames(hvg.scanpy$disp)[hvg.scanpy$disp$highly_variable_scpyD]
hvgs$scpyV = rownames(hvg.scanpy$disp)[hvg.scanpy$vst$highly_variable_scpyV]
hvgs$bioc = hvgs_bioc

o = overlap_phyper2(hvgs,hvgs, remove.diag = T)
```

Mean Bioc stands out the most. 

Vst in seurat and scanpy is identical and very different from the other methods. But it is also based on counts instead of lognorm data. 

The variance estimate for the same sample is quite different. Even the top variable genes are not the same and are very different gene groups.


### Top genes

Explore top  genes with the different methods. 

disp and mvp are identical, remove mvp.

ScanpyV and vst are identical, remove scpyV.

```{r}
#| label: top10
topG = lapply(hvgs, head, 10)

# same for mvp and disp, remove one of them.
topG$mvp = NULL
topG$scpyV = NULL

# sort the genes for scanpy.
topG$scpyD = rownames(ctrl.hvg)[order(ctrl.hvg$dispersions_norm_scpyD, decreasing = T)][1:10]


# rank the genes
allG = unique(unlist(topG))


ranks = Reduce(rbind, lapply(topG, function(x) {
  r = 1:10
  names(r) = x
  r2 = r[allG]
  return(r2)
}))

colnames(ranks) = allG
rownames(ranks) = names(topG)

pheatmap::pheatmap(ranks, display_numbers = ranks, main="Rank of top 10 gens", cluster_rows = F, cluster_cols = F)


```

Plot onto umap for the unique ones.

```{r}
#| label: top10-umap
#| fig-height: 10
#| fig-width: 12

topG$disp = NULL

t = table(unlist(topG))
selG = names(t)[t==1]
selG = sapply(selG, function(y) names(which(unlist(lapply(topG, function(x) any(x==y))))))
selG = sort(selG)

#small.leg <- theme(legend.text = element_text(size=3), legend.key.size = unit(0.5,"point"))

plots = list()
for (g in names(selG)){
  plots[[g]] = FeaturePlot(ctrl, reduction = "umap_harmony", features = g, order = T)  + ggtitle(paste(g,selG[g], collapse = " - ")) + NoAxes() #+ small.leg
}

wrap_plots(plots, ncol = 4)

```

More clear celltype genes in the Bioc selection, but also higher expressed genes.. More B-cell genes for vst.


### Expression levels

Expression levels of the variable genes. As total counts or mean expression across all cells.

```{r}
#| label: exprs
#| fig-height: 8
#| fig-width: 12

ctrl.hvg$nC = rowSums(ctrl@assays$RNA@layers$counts.ctrl_13>0)
ctrl.hvg$meanE = rowMeans(ctrl@assays$RNA@layers$data.ctrl_13)
ctrl.hvg$vg_bioc = rownames(ctrl.hvg) %in% hvgs_bioc

wrap_plots(
ggplot(ctrl.hvg, aes(x=nC, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"),
ggplot(ctrl.hvg, aes(x=nC, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"), 
ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"),
ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"),
ggplot(ctrl.hvg, aes(x=nC, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Bioc"),

ggplot(ctrl.hvg, aes(x=meanE, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"), 
ggplot(ctrl.hvg, aes(x=meanE, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"), 
ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"), 
ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"), 
ggplot(ctrl.hvg, aes(x=meanE, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend()+ ggtitle("Bioc"), 
ncol =5
)


```

BioC distibution shifted to more highly expressed genes. 

Dispersion gives the very highly expressed genes, but also has a spread among all levels of expression.

## Discussion

* Seurat v3 paper suggests vst on counts, but then uses the variable genes in lognorm space, how is this the best option_ 

* From SC best practices, suggests scry for HVG selection. https://www.sc-best-practices.org/preprocessing_visualization/feature_selection.html

### scry

Run scry deviance function and select top 2k genes. 

```{r}
#| label: scry
ctrl.sce<-scry::devianceFeatureSelection(ctrl.sce, assay="counts", sorted=TRUE)
plot(rowData(ctrl.sce)$binomial_deviance, type="l", xlab="ranked genes",
     ylab="binomial deviance", main="Feature Selection with Deviance")
abline(v=2000, lty=2, col="red")

head(rowData(ctrl.sce),10)
```

Top genes are similar to the other methods.


```{r}
#| label: scry-plot
ctrl.hvg$deviance = rowData(ctrl.sce)$binomial_deviance[match(rownames(ctrl.hvg), rownames(rowData(ctrl.sce)))]
ctrl.hvg$devG = rownames(ctrl.hvg) %in% head(rownames(rowData(ctrl.sce)),2000)

wrap_plots(
ggplot(ctrl.hvg, aes(x=nC, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("nCell vs deviance") + NoLegend(), 
ggplot(ctrl.hvg, aes(x=meanE, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("mean expression vs deviance")+ NoLegend(), 
ggplot(ctrl.hvg, aes(x=dispersions_norm_scpyD, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("Dispersion vs deviance")+ NoLegend(), 
ggplot(ctrl.hvg, aes(x=variances_norm_scpyV, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("VST vs deviance")+ NoLegend(), 
ggplot(ctrl.hvg, aes(x=bio_bioc, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("bioc bio vs deviance")+ NoLegend(), 


ncol = 3
)

```
Selects all genes with high expression

```{r}
#| label: scry-overlap
hvgs$deviance = head(rownames(rowData(ctrl.sce)),2000)
o = overlap_phyper2(hvgs,hvgs, remove.diag = T)
```


## {{< meta session >}}

<details>
  <summary>Click here</summary>

```{r}
#| label: session
sessionInfo()
```

</details>