--- title: "Comparison of HVGs" subtitle: "{{< meta subtitle_seurat >}} {{< meta subtitle_bioc >}} {{< meta subtitle_scanpy >}}" description: "Overview of all three pipeline results." format: html --- {{< meta qc_data_2 >}} ```{r} #| label: libraries suppressPackageStartupMessages({ library(Seurat) library(zellkonverter) library(Matrix) library(ggplot2) library(patchwork) library(scran) library(ComplexHeatmap) library(basilisk) }) devtools::source_url("https://raw.githubusercontent.com/asabjorklund/single_cell_R_scripts/main/overlap_phyper_v2.R") ``` ## Load data OBS! Zellkonverter installs conda env with basilisk! Takes a while to run first time!! ```{r} #| label: load path_results <- "data/covid/results" if (!dir.exists(path_results)) dir.create(path_results, recursive = T) path_seurat = "../seurat/data/covid/results/" path_bioc = "../bioc/data/covid/results/" path_scanpy = "../scanpy/data/covid/results/" path_results <- "data/covid/results" if (!dir.exists(path_results)) dir.create(path_results, recursive = T) path_seurat = "../seurat/data/covid/results/" path_bioc = "../bioc/data/covid/results/" path_scanpy = "../scanpy/data/covid/results/" # fetch the files with qc and dimred for each # seurat sobj = readRDS(file.path(path_seurat,"seurat_covid_qc_dr_int_cl.rds")) # bioc sce = readRDS(file.path(path_bioc,"bioc_covid_qc_dr_int_cl.rds")) bioc = as.Seurat(sce) # scanpy scanpy.sce = readH5AD(file.path(path_scanpy, "scanpy_covid_qc_dr_scanorama_cl.h5ad")) scanpy = as.Seurat(scanpy.sce, counts = NULL, data = "X") # only have the var.genes data that is scaled. ``` ## Umaps ```{r} #| label: umaps #| fig-height: 4 #| fig-width: 10 wrap_plots( DimPlot(sobj, group.by = "orig.ident") + NoAxes() + ggtitle("Seurat"), DimPlot(bioc, group.by = "sample") + NoAxes() + ggtitle("Bioc"), DimPlot(scanpy, group.by = "sample", reduction = "X_umap_uncorr") + NoAxes() + ggtitle("Scanpy"), ncol = 3 ) ``` Create one dataset with the cells that are present in all samples. Also add in umap from all 3 pipelines. ```{r} #| label: make-object meta.seurat = sobj@meta.data meta.scanpy = scanpy@meta.data meta.bioc = bioc@meta.data meta.bioc$cell = rownames(meta.bioc) meta.scanpy$cell = sapply(rownames(meta.scanpy), function(x) substr(x,1,nchar(x)-2)) meta.seurat$cell = unlist(lapply(strsplit(rownames(meta.seurat),"_"), function(x) x[3])) ``` ```{r} #| label: make-objects2 in.all = intersect(intersect(meta.scanpy$cell, meta.seurat$cell), meta.bioc$cell) tmp1 = meta.bioc[match(in.all, meta.bioc$cell),] colnames(tmp1) = paste0(colnames(tmp1),"_bioc") tmp2 = meta.scanpy[match(in.all, meta.scanpy$cell),] colnames(tmp2) = paste0(colnames(tmp2),"_scpy") all = sobj[,match(in.all, meta.seurat$cell)] meta.all = cbind(all@meta.data, tmp1,tmp2) all@meta.data = meta.all Reductions(all) tmp = bioc@reductions$UMAP_on_PCA@cell.embeddings[match(in.all, meta.bioc$cell),] rownames(tmp) = colnames(all) all[["umap_bioc"]] = CreateDimReducObject(tmp, key = "umapbioc_", assay = "RNA") tmp = scanpy@reductions$X_umap_uncorr@cell.embeddings[match(in.all, meta.scanpy$cell),] rownames(tmp) = colnames(all) all[["umap_scpy"]] = CreateDimReducObject(tmp, key = "umapscpy_", assay = "RNA") Reductions(all) ``` ## Variable features In Seurat, `FindVariableFeatures` is not batch aware unless the data is split into layers by samples, here we have the variable genes created with layers. In Bioc `modelGeneVar` we used sample as a blocking parameter, e.g calculates the variable genes per sample and combines the variances. Similar in scanpy we used the samples as `batch_key`. ```{r} #| label: hvg hvgs = list() hvgs$seurat = VariableFeatures(sobj) hvgs$bioc = sce@metadata$hvgs # scanpy has no strict number on selection, instead uses dispersion cutoff. So select instead top 2000 dispersion genes. scpy.hvg = rowData(scanpy.sce) hvgs_scanpy = rownames(scpy.hvg)[scpy.hvg$highly_variable] hvgs$scanpy = rownames(scpy.hvg)[order(scpy.hvg$dispersions_norm, decreasing = T)][1:2000] cmat = make_comb_mat(hvgs) print(cmat) UpSet(cmat) ``` Surprisingly low overlap between the methods and many genes that are unique to one pipeline. With discrepancies in the doublet filtering the cells used differ to some extent, but otherwise the variation should be similar. Even if it is estimated in different ways. Is the differences more due to the combination of ranks/dispersions or also found within a single dataset? Only Seurat have the dispersions for each individual dataset stored in the object. ### Compare dispersions Recalculate for bioc. ```{r} #| label: bioc var.out <- modelGeneVar(sce, block = sce$sample) hvgs_bioc <- getTopHVGs(var.out, n = 2000) cutoff <- rownames(var.out) %in% hvgs_bioc ``` ```{r} #| label: dispersions #| fig-height: 10 #| fig-width: 12 # Merge all hvg info all.genes = intersect(rownames(var.out), rownames(scpy.hvg)) scpy.hvg = rowData(scanpy.sce) colnames(scpy.hvg) = paste0(colnames(scpy.hvg), "_scpy") colnames(var.out) = paste0(colnames(var.out), "_bioc") seu.hvg = HVFInfo(all) all.hvg = cbind(seu.hvg[all.genes,], scpy.hvg[all.genes,], var.out[all.genes,]) par(mfrow=c(2,2)) plot(all.hvg$means_scpy, all.hvg$dispersions_norm_scpy, main = "Scanpy",pch = 16, cex = 0.4) points(all.hvg$means_scpy[all.hvg$highly_variable_scpy], all.hvg$dispersions_norm_scpy[all.hvg$highly_variable_scpy], col="red",pch = 16, cex = 0.4) plot(all.hvg$mean_bioc, all.hvg$bio_bioc, pch = 16, cex = 0.4, main = "Bioc") points(all.hvg$mean_bioc[rownames(all.hvg) %in% hvgs$bioc], all.hvg$bio_bioc[rownames(all.hvg) %in% hvgs$bioc], col = "red", pch = 16, cex = .6) plot(log1p(all.hvg$mean), all.hvg$variance.standardized, main = "Seurat",pch = 16, cex = 0.4) points(log1p(all.hvg$mean)[match(hvgs$seurat, rownames(all.hvg))], all.hvg$variance.standardized[match(hvgs$seurat, rownames(all.hvg))],col="red",pch = 16, cex = 0.4) ``` Scanpy uses min_disp=0.5, min_mean=0.0125, max_mean=3, so the highly expressed ones are not included. Seurat uses `variance.standardized` to rank the genes. Bioc uses the `bio` slot with estimated biological variation. ```{r} #| label: pairs-plot all.hvg$mean_log = log1p(all.hvg$mean) sel.means = c("mean","mean_log","means_scpy", "mean_bioc") pairs(all.hvg[,sel.means]) sel.disp = c("variance.standardized","dispersions_norm_scpy", "bio_bioc","total_bioc") pairs(all.hvg[,sel.disp]) ``` Difference in what cells were used, and also in how the dispersions across the samples is combined into one value. Do for just one sample instead. ## For one sample Run hvg selection for ctrl.13 sample, so that the handling of batches is not influencing results. Use default settings in all the different methods. ### Seurat Try the different methods implemented in Seurat. From the help section: ``` * “vst”: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter). * “mean.var.plot” (mvp): First, uses a function to calculate average expression (mean.function) and dispersion (dispersion.function) for each feature. Next, divides features into num.bin (deafult 20) bins based on their average expression, and calculates z-scores for dispersion within each bin. The purpose of this is to identify variable features while controlling for the strong relationship between variability and average expression * “dispersion” (disp): selects the genes with the highest dispersion values ``` Feature selection for individual datasets In each dataset, we next aimed to identify a subset of features (e.g., genes) exhibiting high variability across cells, and therefore represent heterogeneous features to prioritize for downstream analysis. Choosing genes solely based on their log-normalized single-cell variance fails to account for the mean-variance relationship that is inherent to single-cell RNA-seq. Therefore, we first applied a variance-stabilizing transformation to correct for this [Mayer et al., 2018, Hafemeister and Satija, 2019]. To learn the mean-variance relationship from the data, we computed the mean and variance of each gene using the unnormalized data (i.e., UMI or counts matrix), and applied -transformation to both. We then fit a curve to predict the variance of each gene as a function of its mean, by calculating a local fitting of polynomials of degree 2 (R function loess, span = 0.3). This global fit provided us with a regularized estimator of variance given the mean of a feature. As such, we could use it to standardize feature counts without removing higher-than-expected variation. ```{r} #| label: seurat ctrl = all[,all$orig.ident == "ctrl_13"] ctrl@assays$RNA@meta.data[,1:ncol(ctrl@assays$RNA@meta.data)] = NULL ctrl = FindVariableFeatures(ctrl) hvg_seu = list() hvg_seu$vst = VariableFeatures(ctrl) #top20 <- head(hvg_seu$vst, 20) #LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE) ctrl = FindVariableFeatures(ctrl, selection.method = "mean.var.plot") hvg_seu$mvp = VariableFeatures(ctrl) #top20 <- head(hvg_seu$mvp, 20) #LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE) ctrl = FindVariableFeatures(ctrl, selection.method = "dispersion") hvg_seu$disp = VariableFeatures(ctrl) #top20 <- head(hvg_seu$disp, 20) #LabelPoints(plot = VariableFeaturePlot(ctrl), points = top20, repel = TRUE) cmat = make_comb_mat(hvg_seu) cmat ``` Very little overlap between the methods in Seurat. Mvp and disp are calculated on `data`, while vst is done on `counts`. ```{r} #| label: plot-seurat vinfo.seu = ctrl@assays$RNA@meta.data rownames(vinfo.seu) = rownames(ctrl) vinfo.seu$vg_vst = rownames(vinfo.seu) %in% hvg_seu$vst vinfo.seu$vg_disp = rownames(vinfo.seu) %in% hvg_seu$disp vinfo.seu$vg_mvp = rownames(vinfo.seu) %in% hvg_seu$mvp vinfo.seu$vg_vst_disp = vinfo.seu$vg_vst + vinfo.seu$vg_disp == 2 vinfo.seu$hvinfo = ifelse(vinfo.seu$vg_vst, "VST",NA) vinfo.seu$hvinfo[vinfo.seu$vg_mvp] = "MVP" vinfo.seu$hvinfo[vinfo.seu$vg_disp] = "DISP" vinfo.seu$hvinfo[vinfo.seu$vg_vst_disp] = "VST/DISP" means = colnames(vinfo.seu)[grepl("mean", colnames(vinfo.seu))] disp = c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","vf_mvp_data.ctrl_13_mvp.dispersion.scaled") pairs(vinfo.seu[,means]) plot_hvg = function(df, m,d,vg, log=FALSE){ top20 = head(vg,20) p = ggplot(df, aes(x=.data[[m]], y=.data[[d]], color=hvinfo)) + geom_point() + theme_classic() if (log) { p = p + scale_x_log10()} LabelPoints(plot = p, points = top20, repel = TRUE) } p1S = plot_hvg(vinfo.seu, "vf_vst_counts.ctrl_13_mean", "vf_vst_counts.ctrl_13_variance.standardized",hvg_seu$vst, log=TRUE) p2S = plot_hvg(vinfo.seu, "vf_disp_data.ctrl_13_mvp.mean", "vf_disp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$disp) p3S = plot_hvg(vinfo.seu, "vf_mvp_data.ctrl_13_mvp.mean", "vf_mvp_data.ctrl_13_mvp.dispersion.scaled",hvg_seu$mvp) wrap_plots(p1S,p2S,p3S, ncol=2) ``` ### Bioc Run the bioconductor detection method with `modelGeneVar` and `getTopHVGs`. Plot both total variance and "bio" variance. ```{r} #| label: bioc2 ctrl.sce = as.SingleCellExperiment(ctrl) var.out <- modelGeneVar(ctrl.sce) hvgs_bioc <- getTopHVGs(var.out, n = 2000) var.out.df = data.frame(var.out) var.out.df$hvg = rownames(var.out) %in% hvgs_bioc p = ggplot(var.out.df, aes(x=mean, y=total, colour = hvg)) + geom_point() + theme_classic() top20 <- head(hvgs_bioc, 20) pB = LabelPoints(plot = p, points = top20, repel = TRUE) pB p2 = ggplot(var.out.df, aes(x=mean, y=bio, colour = hvg)) + geom_point() + theme_classic() top20 <- head(hvgs_bioc, 20) pB2 = LabelPoints(plot = p2, points = top20, repel = TRUE) pB2 ``` ### Scanpy Run with both `seurat` (on lognorm data) and `seurat_v3` (on counts, is same as vst) For the dispersion-based methods (flavor='seurat' Satija et al. [2015] and flavor='cell_ranger' Zheng et al. [2017]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected. For flavor='seurat_v3'/'seurat_v3_paper' [Stuart et al., 2019], a normalized variance for each gene is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. Only if batch_key is not None, the two flavors differ: For flavor='seurat_v3', genes are first sorted by the median (across batches) rank, with ties broken by the number of batches a gene is a HVG. For flavor='seurat_v3_paper', genes are first sorted by the number of batches a gene is a HVG, with ties broken by the median (across batches) rank. ```{r} #| label: scanpy penv = "/Users/asabjor/miniconda3/envs/scanpy_2024_nopip" hvg.scanpy = basiliskRun(env=penv, fun=function(counts) { scanpy <- reticulate::import("scanpy") ad = reticulate::import("anndata") adata = ad$AnnData(counts) print(adata$X[1:10,1:10]) var1 = scanpy$pp$highly_variable_genes(adata, flavor = "seurat_v3", inplace=FALSE) scanpy$pp$normalize_per_cell(adata, counts_per_cell_after=1e4) scanpy$pp$log1p(adata) print(adata$X[1:10,1:10]) scanpy$pp$highly_variable_genes(adata) return(list(disp=adata$var, vst=var1)) }, counts = t(ctrl@assays$RNA@layers$counts.ctrl_13), testload="scanpy") #flavor #Literal['seurat', 'cell_ranger', 'seurat_v3', 'seurat_v3_paper'] (default: 'seurat') ``` ```{r} #| label: scanpy2 rownames(hvg.scanpy$disp) = rownames(ctrl) rownames(hvg.scanpy$vst) = rownames(ctrl) top20 <- head(rownames(hvg.scanpy$disp)[order(hvg.scanpy$disp$dispersions_norm, decreasing = T)], 20) p = ggplot(hvg.scanpy$disp, aes(x=means, y=dispersions_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy disp") pS = LabelPoints(plot = p, points = top20, repel = TRUE) pS top20 <- head(rownames(hvg.scanpy$vst)[order(hvg.scanpy$vst$variances_norm, decreasing = T)], 20) p2 = ggplot(hvg.scanpy$vst, aes(x=means, y=variances_norm, colour = highly_variable)) + geom_point() + theme_classic() + ggtitle("Scanpy vst") + scale_x_log10() pS2 = LabelPoints(plot = p2, points = top20, repel = TRUE) pS2 ``` ### All together ```{r} #| label: merge-all colnames(hvg.scanpy$disp) = paste0(colnames(hvg.scanpy$disp), "_scpyD") colnames(hvg.scanpy$vst) = paste0(colnames(hvg.scanpy$vst), "_scpyV") colnames(var.out) = paste0(colnames(var.out), "_bioc") ctrl.hvg = cbind(vinfo.seu, var.out, hvg.scanpy$disp, hvg.scanpy$vst) ``` ```{r} #| label: plot-all #| fig-height: 10 #| fig-width: 12 wrap_plots(p1S + ggtitle("Seurat vst"),p2S + ggtitle("Seurat disp"),p3S + ggtitle("Seurat mvp"),pB + ggtitle("Bioc total"),pS,pS2, ncol=3) ``` ```{r} #| label: disps-all sel = c("vf_vst_counts.ctrl_13_variance.standardized","vf_disp_data.ctrl_13_mvp.dispersion.scaled","bio_bioc", "dispersions_norm_scpyD", "variances_norm_scpyV") pairs(ctrl.hvg[,sel]) ``` ```{r} #| label: means-all sel = c("vf_vst_counts.ctrl_13_mean_log","vf_disp_data.ctrl_13_mvp.mean", "mean_bioc", "means_scpyD", "means_scpyV_log") ctrl.hvg$vf_vst_counts.ctrl_13_mean_log = log1p(ctrl.hvg$vf_vst_counts.ctrl_13_mean) ctrl.hvg$means_scpyV_log = log1p(ctrl.hvg$means_scpyV) pairs(ctrl.hvg[,sel]) ``` ```{r} #| label: overlap hvgs = hvg_seu hvgs$scpyD = rownames(hvg.scanpy$disp)[hvg.scanpy$disp$highly_variable_scpyD] hvgs$scpyV = rownames(hvg.scanpy$disp)[hvg.scanpy$vst$highly_variable_scpyV] hvgs$bioc = hvgs_bioc o = overlap_phyper2(hvgs,hvgs, remove.diag = T) ``` Mean Bioc stands out the most. Vst in seurat and scanpy is identical and very different from the other methods. But it is also based on counts instead of lognorm data. The variance estimate for the same sample is quite different. Even the top variable genes are not the same and are very different gene groups. ### Top genes Explore top genes with the different methods. disp and mvp are identical, remove mvp. ScanpyV and vst are identical, remove scpyV. ```{r} #| label: top10 topG = lapply(hvgs, head, 10) # same for mvp and disp, remove one of them. topG$mvp = NULL topG$scpyV = NULL # sort the genes for scanpy. topG$scpyD = rownames(ctrl.hvg)[order(ctrl.hvg$dispersions_norm_scpyD, decreasing = T)][1:10] # rank the genes allG = unique(unlist(topG)) ranks = Reduce(rbind, lapply(topG, function(x) { r = 1:10 names(r) = x r2 = r[allG] return(r2) })) colnames(ranks) = allG rownames(ranks) = names(topG) pheatmap::pheatmap(ranks, display_numbers = ranks, main="Rank of top 10 gens", cluster_rows = F, cluster_cols = F) ``` Plot onto umap for the unique ones. ```{r} #| label: top10-umap #| fig-height: 10 #| fig-width: 12 topG$disp = NULL t = table(unlist(topG)) selG = names(t)[t==1] selG = sapply(selG, function(y) names(which(unlist(lapply(topG, function(x) any(x==y)))))) selG = sort(selG) #small.leg <- theme(legend.text = element_text(size=3), legend.key.size = unit(0.5,"point")) plots = list() for (g in names(selG)){ plots[[g]] = FeaturePlot(ctrl, reduction = "umap_harmony", features = g, order = T) + ggtitle(paste(g,selG[g], collapse = " - ")) + NoAxes() #+ small.leg } wrap_plots(plots, ncol = 4) ``` More clear celltype genes in the Bioc selection, but also higher expressed genes.. More B-cell genes for vst. ### Expression levels Expression levels of the variable genes. As total counts or mean expression across all cells. ```{r} #| label: exprs #| fig-height: 8 #| fig-width: 12 ctrl.hvg$nC = rowSums(ctrl@assays$RNA@layers$counts.ctrl_13>0) ctrl.hvg$meanE = rowMeans(ctrl@assays$RNA@layers$data.ctrl_13) ctrl.hvg$vg_bioc = rownames(ctrl.hvg) %in% hvgs_bioc wrap_plots( ggplot(ctrl.hvg, aes(x=nC, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"), ggplot(ctrl.hvg, aes(x=nC, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"), ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"), ggplot(ctrl.hvg, aes(x=nC, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"), ggplot(ctrl.hvg, aes(x=nC, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Bioc"), ggplot(ctrl.hvg, aes(x=meanE, fill=vg_vst)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("VST"), ggplot(ctrl.hvg, aes(x=meanE, fill=vg_disp)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Disp"), ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyD)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy Disp"), ggplot(ctrl.hvg, aes(x=meanE, fill=highly_variable_scpyV)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend() + ggtitle("Scanpy VST"), ggplot(ctrl.hvg, aes(x=meanE, fill=vg_bioc)) + geom_histogram( alpha=0.5, position="identity") + scale_x_log10() + NoLegend()+ ggtitle("Bioc"), ncol =5 ) ``` BioC distibution shifted to more highly expressed genes. Dispersion gives the very highly expressed genes, but also has a spread among all levels of expression. ## Discussion * Seurat v3 paper suggests vst on counts, but then uses the variable genes in lognorm space, how is this the best option_ * From SC best practices, suggests scry for HVG selection. https://www.sc-best-practices.org/preprocessing_visualization/feature_selection.html ### scry Run scry deviance function and select top 2k genes. ```{r} #| label: scry ctrl.sce<-scry::devianceFeatureSelection(ctrl.sce, assay="counts", sorted=TRUE) plot(rowData(ctrl.sce)$binomial_deviance, type="l", xlab="ranked genes", ylab="binomial deviance", main="Feature Selection with Deviance") abline(v=2000, lty=2, col="red") head(rowData(ctrl.sce),10) ``` Top genes are similar to the other methods. ```{r} #| label: scry-plot ctrl.hvg$deviance = rowData(ctrl.sce)$binomial_deviance[match(rownames(ctrl.hvg), rownames(rowData(ctrl.sce)))] ctrl.hvg$devG = rownames(ctrl.hvg) %in% head(rownames(rowData(ctrl.sce)),2000) wrap_plots( ggplot(ctrl.hvg, aes(x=nC, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("nCell vs deviance") + NoLegend(), ggplot(ctrl.hvg, aes(x=meanE, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("mean expression vs deviance")+ NoLegend(), ggplot(ctrl.hvg, aes(x=dispersions_norm_scpyD, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("Dispersion vs deviance")+ NoLegend(), ggplot(ctrl.hvg, aes(x=variances_norm_scpyV, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("VST vs deviance")+ NoLegend(), ggplot(ctrl.hvg, aes(x=bio_bioc, y=deviance, color=devG)) + geom_point() + theme_classic() + ggtitle("bioc bio vs deviance")+ NoLegend(), ncol = 3 ) ``` Selects all genes with high expression ```{r} #| label: scry-overlap hvgs$deviance = head(rownames(rowData(ctrl.sce)),2000) o = overlap_phyper2(hvgs,hvgs, remove.diag = T) ``` ## {{< meta session >}}
Click here ```{r} #| label: session sessionInfo() ```