--- title: "Comparison of normalization" subtitle: "{{< meta subtitle_seurat >}} {{< meta subtitle_bioc >}} {{< meta subtitle_scanpy >}}" description: "Overview of different normalization methods" format: html --- Run different methods of normalization and look at how the distributions look after norm. * Lognorm with different scale factors. * SCT with all cells or per sample. How are residuals effected? * TMM, Quantile. * Deconvolution * Pearson residuals in scanpy ```{r} #| label: libraries suppressPackageStartupMessages({ library(Seurat) library(Matrix) library(ggplot2) library(patchwork) library(scran) library(basilisk) library(dplyr) }) devtools::source_url("https://raw.githubusercontent.com/asabjorklund/single_cell_R_scripts/main/overlap_phyper_v2.R") ``` # Load data ```{r} #| label: load path_seurat = "../seurat/data/covid/results/" # seurat sobj = readRDS(file.path(path_seurat,"seurat_covid_qc_dr_int_cl.rds")) sobj ``` The object has the data split into layers, both counts and data. One single scale.data layer. Plot integrated data: ```{r} #| label: umaps #| fig-height: 4 #| fig-width: 10 wrap_plots( DimPlot(sobj, group.by = "RNA_snn_res.0.5", reduction = "umap_cca") + NoAxes(), DimPlot(sobj, group.by = "orig.ident", reduction = "umap_cca") + NoAxes(), FeaturePlot(sobj, "nFeature_RNA", reduction = "umap_cca") + NoAxes(), FeaturePlot(sobj, "percent_mito", reduction = "umap_cca") + NoAxes(), ncol = 2 ) sobj = SetIdent(sobj, value = "RNA_snn_res.0.5") sobj@active.assay = "RNA" ``` Select a set of variable genes to analyse across all normalizations, 5K genes. ```{r} #| label: hvgs sobj = FindVariableFeatures(sobj, nfeatures = 5000, verbose = F) hvg = VariableFeatures(sobj) ``` # Normalize ## SCT per layer First, run SCT on each layer, e.g. each sample. ```{r} #| label: sctl sobj = SCTransform(sobj, new.assay.name = "SCTL", verbose = F) dim(sobj@assays$SCTL@counts) dim(sobj@assays$SCTL@data) dim(sobj@assays$SCTL@scale.data) ``` First runs SCT for each layer separately, then creates one merged counts, data, scale.data. Fewer genes in the SCT counts/data matrix. ```{r} #| label: sctl-var vg = list() for (n in names(sobj@assays$SCTL@SCTModel.list)){ vg[[n]] = rownames(sobj@assays$SCTL@SCTModel.list[[n]]@feature.attributes)[sobj@assays$SCTL@SCTModel.list[[n]]@feature.attributes$genes_log_gmean_step1] } o = overlap_phyper2(vg,vg, bg = nrow(sobj@assays$RNA)) all.vg = unique(unlist(vg)) length(all.vg) dim(sobj@assays$SCTL@scale.data) sum(rownames(sobj@assays$SCTL@scale.data) %in% all.vg) ``` Total variable genes from each dataset is more than the size of `scale.data`. The function first runs SCT for each dataset, then runs Seurat VariableFeatures on each object. and takes the union! ``` # from the seurat function: vf.list <- lapply(X = sct.assay.list, FUN = function(object.i) VariableFeatures(object = object.i)) variable.features.union <- Reduce(f = union, x = vf.list) ``` Those genes are used to calculate the residuals in scale.data. Still, among those genes, only 4437 are among the first set of 5000 variable genes. Calculate residuals for all genes: ```{r} #| label: prep-sctl sobj = PrepSCTFindMarkers(sobj, assay = "SCTL") #, anchor.features = intersect(rownames(sobj), hvg))$s ``` ## Annotation Merge the layers and run all normalization with all genes together. Filter genes with less than 5 cells. ```{r} #| label: join-layers sobj@active.assay = "RNA" sobj <- JoinLayers(object = sobj, layers = c("data","counts")) # remove genes with less than 5 cells. nC = rowSums(sobj@assays$RNA@layers$counts > 0) sobj = sobj[nC>4,] sobj = NormalizeData(sobj, verbose = F) # add the normalized to a list ndata = list() C = sobj@assays$RNA@layers$counts rownames(C) = rownames(sobj) colnames(C) = colnames(sobj) ndata$counts = C D = sobj@assays$RNA@layers$data rownames(D) = rownames(sobj) colnames(D) = colnames(sobj) ndata$lognorm10k = D # also add in the SCTL data ndata$sctl_counts = sobj@assays$SCTL@counts ndata$sctl_data = sobj@assays$SCTL@data ndata$sctl_scaledata = sobj@assays$SCTL@scale.data ``` Add celltype annotation from Azimuth. Define as broad celltypes based on the clusters for visualization. ```{r} #| label: celltypes celltype.file = "data/celltype_azimuth.csv" if (file.exists(celltype.file)){ ct = read.csv(celltype.file, row.names = 1) sobj$celltype = ct$celltype sobj$predicted.celltype.l1 = ct$predicted.celltype.l1 }else { library(Azimuth) sobj <- RunAzimuth(sobj, reference = "pbmcref") annot = list("0"="Mono","5"="Mono","8"="Other","9"="Mix","3"="BC","6"="BC","1"="NK","2"="TC","4"="TC", "7"="TC") sobj$celltype = unname(unlist(annot[as.character(sobj@active.ident)])) ct = sobj@meta.data[,c("celltype","predicted.celltype.l1")] write.csv(ct, file=celltype.file) } p1 = DimPlot(sobj, label = T, reduction = "umap_cca") + NoAxes() p2 = DimPlot(sobj, reduction = "umap_cca", group.by = "predicted.celltype.l1", label = T) + NoAxes() p3 = DimPlot(sobj, reduction = "umap_cca", group.by = "celltype", label = T) + NoAxes() wrap_plots(p1,p2,p3, ncol=2) sobj = SetIdent(sobj, value = "celltype") ``` ## SCT merged Run with all cells considered a single sample. ```{r} #| fig.height: 6 #| label: sct sobj = SCTransform(sobj, new.assay.name = "SCT", verbose = F) ndata$sct_data = sobj@assays$SCT@data ndata$sct_counts = sobj@assays$SCT@counts VlnPlot(sobj, c("CD3E","CD14","CCR7","RPL10"), assay = "SCT", slot = "data", ncol=4) VlnPlot(sobj, c("CD3E","CD14","CCR7","RPL10"), assay = "SCTL", slot = "data", ncol=4) ``` Returns a Seurat object with a new assay (named SCT by default) with counts being (corrected) counts, data being log1p(counts), scale.data being pearson residuals ```{r} #| label: sct-dims dim(sobj@assays$RNA@layers$counts) dim(sobj@assays$SCT@counts) dim(sobj@assays$SCT@scale.data) ``` ```{r} #| label: sct-plot #| fig.height: 8 wrap_plots( FeatureScatter(sobj, "nCount_RNA","nCount_SCT", group.by = "orig.ident") + NoLegend(), FeatureScatter(sobj, "nFeature_RNA","nFeature_SCT", group.by = "orig.ident") + NoLegend(), FeatureScatter(sobj, "nCount_RNA","nCount_SCTL", group.by = "orig.ident") + NoLegend(), FeatureScatter(sobj, "nFeature_RNA","nFeature_SCTL", group.by = "orig.ident")+ NoLegend(), ncol = 2 ) range(sobj$nFeature_RNA) range(sobj$nFeature_SCT) VlnPlot(sobj, c("nCount_RNA","nCount_SCT", "nCount_SCTL")) + NoLegend() VlnPlot(sobj, c("nFeature_RNA","nFeature_SCT", "nFeature_SCTL")) + NoLegend() ``` Much lower number of features per cell, especially in the high nF cells. But also higher lowest number of features. How can the nFeatures increase? Seems like it is mainly genes that are high in most cells that get "imputed" counts in SCT, but they are still low. Plot stats per gene: ```{r} #| label: sct-gene-plot df.genes = data.frame( nCell_RNA=rowSums(ndata$counts>0), nUMI_RNA=rowSums(ndata$counts), nCell_SCT=rowSums(ndata$sct_counts>0), nUMI_SCT=rowSums(ndata$sct_counts)) wrap_plots( ggplot(df.genes, aes(x=nCell_RNA,y=nCell_SCT)) + geom_point() + theme_classic(), ggplot(df.genes, aes(x=nUMI_RNA,y=nUMI_SCT)) + geom_point() + theme_classic() + scale_x_log10() + scale_y_log10(), ggplot(df.genes, aes(x=nUMI_RNA,y=nCell_RNA)) + geom_point() + theme_classic() + scale_x_log10(), ggplot(df.genes, aes(x=nUMI_SCT,y=nCell_SCT)) + geom_point() + theme_classic() + scale_x_log10(), ncol = 2 ) ``` Each gene is found in a similar number of cells, but the nUMI per gene is much lower with SCT Get residuals for all of the hvgs ```{r} #| label: sct-prep sobj@active.assay = "SCT" sobj = PrepSCTIntegration(list(s = sobj), anchor.features = intersect(rownames(sobj), hvg))$s # some of the hvgs are not in the SCT assay! ndata$sct_scaledata = sobj@assays$SCT@scale.data dim(sobj@assays$SCT@scale.data) ``` Many cells have counts for genes where there was no detection. Check for instance chrY genes and the XIST gene. That we know should mainly be in males/females. ```{r} #| label: chrYgenes genes_file <- file.path(path_seurat, "genes_table.csv") genes.table <- read.csv(genes_file) genes.table <- genes.table[genes.table$external_gene_name %in% rownames(sobj), ] par1 = c(10001, 2781479) par2 = c(56887903, 57217415) p1.gene = genes.table$external_gene_name[genes.table$start_position > par1[1] & genes.table$start_position < par1[2] & genes.table$chromosome_name == "Y"] p2.gene = genes.table$external_gene_name[genes.table$start_position > par2[1] & genes.table$start_position < par2[2] & genes.table$chromosome_name == "Y"] chrY.gene <- genes.table$external_gene_name[genes.table$chromosome_name == "Y"] chrY.gene = setdiff(chrY.gene, c(p1.gene, p2.gene)) sobj <- PercentageFeatureSet(sobj, features = chrY.gene, col.name = "pct_chrY_SCT", assay = "SCT") # SCTL has fewer genes in the count matrix, sobj <- PercentageFeatureSet(sobj, features = intersect(chrY.gene, rownames(sobj@assays$SCTL@counts)), col.name = "pct_chrY_SCTL", assay = "SCTL") sobj <- PercentageFeatureSet(sobj, features = chrY.gene, col.name = "pct_chrY_RNA", assay = "RNA") VlnPlot(sobj, group.by = "orig.ident", features = c("pct_chrY_RNA", "pct_chrY_SCT","pct_chrY_SCTL")) ``` ChrY genes have low counts in the female cells with SCT. But not in SCTL which is run per sample, so SCT borrows information across the samples. ```{r} #| label: sex-plot #| fig.height: 8 p1 = VlnPlot(sobj, group.by = "orig.ident", features = "XIST", assay = "RNA", slot = "data") + NoLegend() p2 = VlnPlot(sobj, group.by = "orig.ident", features = "XIST", assay = "SCT", slot = "data") + NoLegend() p3 = VlnPlot(sobj, group.by = "orig.ident", features = "XIST", assay = "SCTL", slot = "data") + NoLegend() p1 + p2 + p3 ``` Same for XIST expression. ## Subsample From now, use a smaller dataset. Select 10 cells per sample and cluster to make a smaller dataset for the comparison. Use resolution 1 with 11 clusters. Then filter again for genes detected in at least 5 cells. ```{r} #| label: subsample dim(sobj) sobj@active.assay = "RNA" sobj = DietSeurat(sobj, assays = "RNA") sobj$clust_sample = paste0(sobj$RNA_snn_res.1, sobj$orig.ident) sobj = SetIdent(sobj, value = "clust_sample") sobj = subset(sobj, cells = WhichCells(sobj, downsample = 10)) nC = rowSums(sobj@assays$RNA@layers$counts > 0) sobj = sobj[nC>4,] dim(sobj) ``` Also subset all normdata for the same cells/genes, and also the subset of variable genes ```{r} #| label: subset-ndata hvg = intersect(rownames(sobj),hvg) for (n in names(ndata)){ tmp = intersect(hvg, rownames(ndata[[n]])) ndata[[n]] = ndata[[n]][tmp,colnames(sobj)] } # clean memory gc() ``` ## Scalefactors lognorm First plot counts and default lognorm (sf=10K) and calculate scale.data for the different normalizations. Also calculate hvgs to get dispersions at different scale factors. ### SF 10K ```{r} #| label: lognorm10k sobj = SetIdent(sobj, value = "RNA_snn_res.0.5") #VlnPlot(sobj, c("CD3E","CD14","CCR7","RPL10"), slot = "counts", ncol=4) #VlnPlot(sobj, c("CD3E","CD14","CCR7","RPL10"), slot = "data", ncol=4) # vagenes for 10k vinfo = list() tmp = FindVariableFeatures(sobj, selection.method = "mean.var.plot", verbose = F, assay = "RNA") vinfo[["10k"]] = tmp@assays$RNA@meta.data sobj = ScaleData(sobj, features = hvg, assay = "RNA") D = sobj@assays$RNA@layers$scale.data rownames(D) = hvg colnames(D) = colnames(sobj) ndata$lognorm10k_scaledata = D cd3plots = list() cd3plots[[1]] = VlnPlot(sobj, features = "CD3E") + NoAxes() + ggtitle("10K") + NoLegend() ``` ### SF 1K ```{r} #| label: lognorm1k tmp = NormalizeData(sobj, scale.factor = 1000, verbose = F) tmp = FindVariableFeatures(tmp, selection.method = "mean.var.plot", verbose = F, assay = "RNA") vinfo[["1k"]] = tmp@assays$RNA@meta.data D = tmp@assays$RNA@layers$data rownames(D) = rownames(sobj) colnames(D) = colnames(sobj) ndata$lognorm1k = D[hvg,] #VlnPlot(tmp, c("CD3E","CD14","CCR7","RPL10"), slot = "data", ncol=4) tmp = ScaleData(tmp, features = hvg, assay = "RNA") D = tmp@assays$RNA@layers$scale.data rownames(D) = hvg colnames(D) = colnames(sobj) ndata$lognorm1k_scaledata = D cd3plots[[2]] = VlnPlot(tmp, features = "CD3E") + NoAxes() + ggtitle("1K") + NoLegend() ``` ### SF 100K ```{r} #| label: lognorm100k tmp = NormalizeData(sobj, scale.factor = 100000, verbose = F) D = tmp@assays$RNA@layers$data rownames(D) = rownames(sobj) colnames(D) = colnames(sobj) ndata$lognorm100k = D[hvg,] tmp = ScaleData(tmp, features = hvg, assay = "RNA") D = tmp@assays$RNA@layers$scale.data rownames(D) = hvg colnames(D) = colnames(sobj) ndata$lognorm100k_scaledata = D tmp = FindVariableFeatures(tmp, selection.method = "mean.var.plot", verbose = F, assay = "RNA") vinfo[["100k"]] = tmp@assays$RNA@meta.data #VlnPlot(tmp, c("CD3E","CD14","CCR7","RPL10"), slot = "data", ncol=4) cd3plots[[3]] = VlnPlot(tmp, features = "CD3E") + NoAxes() + ggtitle("100K") + NoLegend() wrap_plots(cd3plots, ncol = 3) ``` Clearly gives a larger difference to zero with larger size factors. So the dispersion will be higher. In principle is the same as changing the pseudocount which is now 1. ### Dispersions Plot the estimated dispersions with mvp vs mean. ```{r} #| label: dispersions vinfo = vinfo[sort(names(vinfo))] plots = list() for (n in names(vinfo)){ plots[[n]] = ggplot(vinfo[[n]], aes(x=vf_mvp_data_mvp.mean, y=vf_mvp_data_mvp.dispersion.scaled, color = vf_mvp_data_variable)) + geom_point()+ ggtitle(paste0(n, " - ", sum(vinfo[[n]]$vf_mvp_data_variable), " hvgs")) + theme_classic() + NoLegend() } wrap_plots(plots, ncol=2) ``` Much higher dispersions and more variable genes with 100K, very low with 1k. The cutoffs are dispersions over 1 and mean 0.1-8 ```{r} #| label: dispersions-pair disp = data.frame(Reduce(cbind,lapply(vinfo, function(x) x$vf_mvp_data_mvp.dispersion.scaled))) colnames(disp) = names(vinfo) plot(disp) ``` Still similar trend on the dispersions, the same genes have high dispersion. ## Logcounts Log of counts without depth normalization ```{r} #| label: logcounts ndata$logcounts = log1p(ndata$counts) ``` ## TMM Run EdgeR TMM, ```{r} #| eval: true #| label: tmm # TMM dge = edgeR::DGEList(sobj@assays$RNA@layers$counts) dge <- edgeR::normLibSizes(dge, method = "TMM") tmp = edgeR::cpm(dge) # is with log = FALSE colnames(tmp) = colnames(sobj) tmp = tmp[rownames(sobj) %in% hvg,] rownames(tmp) = hvg ndata$tmm = tmp tmp = edgeR::cpm(dge, log = TRUE) colnames(tmp) = colnames(sobj) tmp = tmp[rownames(sobj) %in% hvg,] rownames(tmp) = hvg ndata$tmm_log = tmp ``` ## Quantile limma-voom quantile ```{r} #| label: quantile # should be run on the lognorm values. use the full dataset to calculate. tmp = limma::voom(sobj@assays$RNA@layers$data,normalize.method = "quantile")$E colnames(tmp) = colnames(sobj) rownames(tmp) = rownames(sobj) ndata$quantile = tmp[hvg,] ``` ## Deconvolution Run `quickCluster` and then `computeSumFactors`. ```{r} #| label: deconv library(scran) set.seed(100) sce = as.SingleCellExperiment(sobj) cl = quickCluster(sce) table(cl) sce <- computeSumFactors(sce, cluster=cl, min.mean=0.1) sce <- logNormCounts(sce) assayNames(sce) tmp = assay(sce, "logcounts") tmp = tmp[hvg,] ndata$deconv = tmp ``` ## Pearson residuals Using the method implemented in scanpy. ```{r} #| label: pearson penv = "/Users/asabjor/miniconda3/envs/scanpy_2024_nopip" norm.scanpy = basiliskRun(env=penv, fun=function(counts) { scanpy <- reticulate::import("scanpy") ad = reticulate::import("anndata") adata = ad$AnnData(counts) print(adata$X[1:10,1:10]) #sc.experimental.pp.normalize_pearson_residuals(adata) scanpy$experimental$pp$normalize_pearson_residuals(adata) return(list(norm=adata$X)) }, counts = t(sobj@assays$RNA@layers$counts), testload="scanpy") rownames(norm.scanpy$norm) = colnames(sobj) colnames(norm.scanpy$norm) = rownames(sobj) ndata$pearson = t(norm.scanpy$norm[,hvg]) ``` ## CD3 violins Plot cd3 violins for these different normalizations. ```{r} #| fig.width: 14 #| fig.height: 14 #| label: cd3 cd3 = data.frame(Reduce(cbind, lapply(ndata, function(x) x["CD3E",]))) colnames(cd3)=names(ndata) cd3$clusters = sobj@active.ident df = reshape2::melt(cd3, id = "clusters") df$variable = factor(df$variable, levels = sort(names(ndata))) ggplot(df, aes(x=clusters, y=value, fill=clusters)) + geom_violin(trim = T, scale = "width" , adjust = 1) + facet_wrap(~variable, scales = "free") ``` # Distributions Plot the expression distribution for a few single cells using each method. Select cells with highest/medium/lowest total counts. ```{r} #| label: select-cells cs = colSums(ndata$counts) r = order(cs) sel = r[c(1,460,length(r))] cs[sel] ``` ```{r} #| fig.height: 14 #| warning: false #| label: distributions subset = lapply(ndata, function(x) x[,sel]) plots = list() for (ds in sort(names(subset))){ l = reshape2::melt(data.frame(subset[[ds]])) plots[[ds]] = ggplot(l, aes(x=value)) + geom_histogram(bins = 200) + theme_classic() + facet_wrap(~variable, scales="free") + scale_y_log10() + ggtitle(ds) } wrap_plots(plots[1:6], ncol=1) wrap_plots(plots[7:12], ncol=1) wrap_plots(plots[13:length(plots)], ncol=1) ``` # Cell correlations For all the different normalizations, calculate the correlation between cells. For a fair comparison, use the same set of genes. Except for the scaledata for SCTL that has too few genes! ```{r} #| label: cor-function # corSparse function copied from Signac package utilities.R corSparse <- function(X, Y = NULL, cov = FALSE) { X <- as(object = X, Class = "CsparseMatrix") n <- nrow(x = X) muX <- colMeans(x = X) if (!is.null(x = Y)) { if (nrow(x = X) != nrow(x = Y)) { stop("Matrices must contain the same number of rows") } Y <- as(object = Y, Class = "CsparseMatrix") muY <- colMeans(x = Y) covmat <- ( as.matrix(x = crossprod(x = X, y = Y)) - n * tcrossprod(x = muX, y = muY) ) / (n-1) sdvecX <- sqrt( (colSums(x = X^2) - n*muX^2) / (n-1) ) sdvecY <- sqrt( (colSums(x = Y^2) - n*muY^2) / (n-1) ) cormat <- covmat / tcrossprod(x = sdvecX, y = sdvecY) } else { covmat <- ( as.matrix(crossprod(x = X)) - n * tcrossprod(x = muX) ) / (n-1) sdvec <- sqrt(x = diag(x = covmat)) cormat <- covmat / tcrossprod(x = sdvec) } if (cov) { dimnames(x = covmat) <- NULL return(covmat) } else { dimnames(x = cormat) <- NULL return(cormat) } } ``` ```{r} #| label: correl # have fewer genes for sctl-scaledata, still use the same set for all other samples ignoring that sctl has fewer! t = table(unlist(lapply(ndata, rownames))) selG = names(t)[t >= (length(ndata)-1)] ndata = lapply(ndata, function(x) x[intersect(rownames(x),selG),]) cors = list() for (n in sort(names(ndata))){ print(n) cors[[n]] = corSparse(ndata[[n]]) } #cors = lapply(ndata, function(x) corSparse(x)) ``` ```{r} #| label: correl-plot breaks = seq(-0.5,1,0.01) cc = lapply(cors, function(x) hist(x[upper.tri(x)],breaks=breaks,plot=F)$counts) count.hist = data.frame(Reduce(cbind,cc)) colnames(count.hist) = names(ndata) count.hist$breaks = breaks[-1] c = Reduce(cbind, lapply(cors, function(x) x[upper.tri(x)])) colnames(c) = names(cors) long = reshape2::melt(c) ggplot(long, aes(x=value)) + geom_histogram(bins=100) + facet_wrap(~Var2, scales = "free_y") + geom_vline(xintercept =0, color="red") stats = apply(c, 2, summary) t(stats) ``` # Clustering Run a quick umap/clustering with the different normalizations. All with same set of genes, pca with 20 PCs, ```{r} #| label: cluster # need to run the commands to get all setting correct in the seurat object. sobj.all = CreateSeuratObject(counts = ndata$counts[selG,]) sobj.all = NormalizeData(sobj.all, verbose = F) sobj.all = ScaleData(sobj.all, verbose = F) sobj.all$old_clust = sobj$RNA_snn_res.0.5 sobj.all$sample = sobj$orig.ident # then run per dataset for (ds in sort(names(ndata))){ if (ds == "sctl_scaledata") { next } d = as(ndata[[ds]][selG,], "matrix") sobj.all@assays$RNA@layers$scale.data = d sobj.all = sobj.all %>% RunPCA(npcs=50,verbose = F, features = selG) %>% RunUMAP(dims=1:20, verbose = F) %>% FindNeighbors(dims=1:20, verbose = F) %>% FindClusters(resolution = 0.6, verbose = F) sobj.all[[paste0("umap_",ds)]] = sobj.all[["umap"]] sobj.all[[paste0("clusters_",ds)]] = sobj.all[["seurat_clusters"]] sobj.all[[paste0("pca_",ds)]] = sobj.all[["pca"]] } ``` Gene detection in all: ```{r} #| label: features plots = list() for (ds in sort(names(ndata))){ if (ds == "sctl_scaledata") { next } plots[[ds]] = FeaturePlot(sobj.all, features = "nFeature_RNA", reduction = paste0("umap_",ds), pt.size = .2) + NoLegend() + NoAxes() + ggtitle(ds) + theme(plot.title = element_text(size = 6)) } wrap_plots(plots, ncol=4) ``` Celltype predictions in all: ```{r} #| label: celltype-plot plots = list() sobj.all$celltype = sobj[,colnames(sobj.all)]$celltype for (ds in sort(names(ndata))){ if (ds == "sctl_scaledata") { next } plots[[ds]] = DimPlot(sobj.all, group.by = "celltype", label = T, label.size = 2, reduction = paste0("umap_",ds), pt.size = .2) + NoLegend() + NoAxes() + ggtitle(ds) + theme(plot.title = element_text(size = 6)) } wrap_plots(plots, ncol=4) ``` ### PCs stats #### Contribution to variance How much variance is in each pc, among the 50 PCs calculated. Plot for first 10 PCs ```{r} #| label: pcs nplot = 10 pcs = names(sobj.all@reductions)[grepl("pca_", names(sobj.all@reductions))] std = lapply(pcs, function(x) Stdev(sobj.all, reduction = x)) variance_explained <- data.frame(Reduce(cbind,lapply(std, function(x) (x^2 / sum(x^2) * 100)[1:nplot]))) colnames(variance_explained) = pcs variance_explained$PC = factor(as.character(1:nrow(variance_explained)), levels = as.character(1:nrow(variance_explained))) df = reshape2::melt(variance_explained[1:10,]) ggplot(df, aes(x=PC, y=value)) + geom_bar(stat="identity") + facet_wrap(~variable) ``` For TMM and quantile, pretty much all variance is in PC1. More flat for the scaled data and for residuals. Also more flat with lower size factors. #### PC correlation to different stats Correlation to nFeatures - plot as R-squared. ```{r} #| label: pcs-features emb = lapply(pcs, function(x) Embeddings(sobj.all, reduction = x)) plot_pc_correl = function(emb, feature){ f = as.matrix(sobj[[feature]]) cors = Reduce(rbind,lapply(emb, function(x) { apply(x[,1:10],2,function(y) summary(lm(y ~ f))$r.squared) })) df = data.frame(t(cors)) colnames(df) = pcs df$PC = factor(as.character(1:10), levels = as.character(1:10)) df2 = reshape2::melt(df) df2$value = abs(df2$value) ggplot(df2, aes(x=PC, y=value)) + geom_bar(stat="identity") + facet_wrap(~variable) } plot_pc_correl(emb, "nFeature_RNA") ``` Most methods have PC1 dominated by nFeatures, but more so for logcounts, lognorm with large size factors and bulk methods. Same for mito genes and total counts: ```{r} #| label: pcs-qc plot_pc_correl(emb, "percent_mito") plot_pc_correl(emb, "nCount_RNA") ``` Vs celltypes: ```{r} #| label: pcs-celltype plot_pc_correl(emb, "celltype") ``` # DGE Run DGE detection between 2 clusters, do NK vs T-cell, using the differently normalized data and compare results. One parametric test (MAST), one rank-based (wilcoxon). Select 150 cells of each celltype. Just for lognorm, deconvolution, sct data, tmm-log and logcounts. ```{r} #| label: run-dge # select 150 cells from each celltype sobj = SetIdent(sobj, value = "celltype") selC = intersect(colnames(sobj)[which(sobj$celltype %in% c("NK","TC"))], WhichCells(sobj, downsample = 150)) tmp = sobj[selG,selC] # then run per dataset markersM = list() sel.methods = c("lognorm10k","deconv","logcounts","sct_data","tmm_log") for (ds in sel.methods){ d = as(ndata[[ds]][rownames(tmp),selC], "matrix") tmp@assays$RNA@layers$data = d markersM[[paste0("M-",ds)]] = FindMarkers(tmp, ident.1 = "TC", ident.2 = "NK", test.use = "MAST") markersM[[paste0("W-",ds)]] = FindMarkers(tmp, ident.1 = "TC", ident.2 = "NK", test.use = "wilcox") } ``` Select significant genes below 0.01. OBS! test only among 4.6K genes, so adjusted pvalue is lower than it should. ```{r} #| label: alltests signM = lapply(markersM, function(x) { up = rownames(x)[x$avg_log2FC>0 & x$p_val_adj < 0.01] down = rownames(x)[x$avg_log2FC<0 & x$p_val_adj < 0.01] return(list(up=up, down=down)) }) nM = Reduce(cbind, lapply(signM, function(x) c(length(x$up), length(x$down)))) colnames(nM) = names(signM) rownames(nM) = c("TC","NK") df = reshape2::melt(nM) ggplot(df, aes(x=Var2, y=value, fill = Var1)) + geom_bar(stat = "identity") + theme_classic() + RotatedAxis() + ggtitle("Number of significant genes") ``` Select all genes enriched in NK, top 100 genes ```{r} #| label: dge-nk l1 = lapply(signM, function(x) x$down[1:100]) o = overlap_phyper2(l1,l1, silent = T, remove.diag = T) m = o$M[1:length(l1), 1:length(l1)] diag(m) = NA pheatmap(m, display_numbers = m, main="top 100 NK genes") ``` Same for Tcell degs, fewer significant genes, take top 65 ```{r} #| label: dge-tc l1 = lapply(signM, function(x) x$up[1:65]) o = overlap_phyper2(l1,l1, silent = T, remove.diag = T) m = o$M[1:length(l1), 1:length(l1)] diag(m) = NA pheatmap(m, display_numbers = m, main="top 65 TC genes") ``` For TC genes 55 - 64 genes overlap among top 65. For NK genes 90 - 99 of top 100 genes overlap. Quite similar regardless of normalization. # {{< meta session >}}
Click here ```{r} #| label: session sessionInfo() ```