--- author: "Åsa Björklund & Paulo Czarnewski" date: '`r format(Sys.Date(), "%B %d, %Y")`' output: html_document: self_contained: true highlight: tango df_print: paged toc: yes toc_float: collapsed: false smooth_scroll: true toc_depth: 3 keep_md: yes fig_caption: true html_notebook: self_contained: true highlight: tango df_print: paged toc: yes toc_float: collapsed: false smooth_scroll: true toc_depth: 3 editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, warning=FALSE, result='hold',fig.width=12,tidy=TRUE) knitr::opts_knit$set(progress=TRUE,verbose=TRUE) ``` ## Celltype prediction *** Celltype prediction can either be performed on indiviudal cells where each cell gets a predicted celltype label, or on the level of clusters. All methods are based on similarity to other datasets, single cell or sorted bulk RNAseq, or uses know marker genes for each celltype. We will select one sample from the Covid data, `ctrl_13` and predict celltype by cell on that sample. Some methods will predict a celltype to each cell based on what it is most similar to even if the celltype of that cell is not included in the reference. Other methods include an uncertainty so that cells with low similarity scores will be unclassified. There are multiple different methods to predict celltypes, here we will just cover a few of those. Here we will use a reference PBMC dataset from the `scPred` package which is provided as a Seurat object with counts. And we will test classification based on the `scPred` and `scMap` methods. Finally we will use gene set enrichment predict celltype based on the DEGs of each cluster. # Load and process data ## Covid-19 data First, lets load required libraries and the saved object from the clustering step. Subset for one patient. ```{r} suppressPackageStartupMessages({ library(scater) library(scran) library(dplyr) library(cowplot) library(ggplot2) library(pheatmap) library(rafalib) library(scPred) library(scmap) }) ``` ```{r} #load the data and select 'ctrl_13` sample alldata <- readRDS("data/results/covid_qc_dr_int_cl.rds") ctrl.sce <- alldata[, alldata@colData$sample == 'ctrl.13'] # remove all old dimensionality reductions as they will mess up the analysis further down reducedDims(ctrl.sce) <- NULL ``` ## Reference data Then, load the reference dataset with annotated labels. Also, run all steps of the normal analysis pipeline with normalizaiton, variable gene selection, scaling and dimensionality reduction. ```{r} reference <- scPred::pbmc_1 reference ``` Convert to a SCE object. ```{r} ref.sce = Seurat::as.SingleCellExperiment(reference) ``` ## Rerun analysis pipeline Run normalization, feature selection and dimensionality reduction ```{r} #Normalize ref.sce <- computeSumFactors(ref.sce) ref.sce <- logNormCounts(ref.sce) #Variable genes var.out <- modelGeneVar(ref.sce, method="loess") hvg.ref <- getTopHVGs(var.out, n=1000) # Dim reduction ref.sce <- runPCA(ref.sce, exprs_values = "logcounts", scale = T, ncomponents = 30, subset_row = hvg.ref) ref.sce <- runUMAP(ref.sce, dimred = "PCA") ``` ```{r, fig.width=5} plotReducedDim(ref.sce,dimred = "UMAP",colour_by = "cell_type") ``` Run all steps of the analysis for the ctrl sample as well. Use the clustering from the integration lab with resolution 0.3. ```{r} #Normalize ctrl.sce <- computeSumFactors(ctrl.sce) ctrl.sce <- logNormCounts(ctrl.sce) #Variable genes var.out <- modelGeneVar(ctrl.sce, method="loess") hvg.ctrl <- getTopHVGs(var.out, n=1000) # Dim reduction ctrl.sce <- runPCA(ctrl.sce, exprs_values = "logcounts", scale = T, ncomponents = 30, subset_row = hvg.ctrl) ctrl.sce <- runUMAP(ctrl.sce, dimred = "PCA") ``` ```{r, fig.width=5} plotReducedDim(ctrl.sce,dimred = "UMAP",colour_by = "louvain_SNNk15") ``` # scMap The scMap package is one method for projecting cells from a scRNA-seq experiment on to the cell-types or individual cells identified in a different experiment. It can be run on different levels, either projecting by cluster or by single cell, here we will try out both. ## scMap cluster For scmap cell type labels must be stored in the `cell_type1` column of the `colData` slots, and gene ids that are consistent across both datasets must be stored in the `feature_symbol` column of the `rowData` slots. Then we can select variable features in both datasets. ```{r} # add in slot cell_type1 ref.sce@colData$cell_type1 = ref.sce@colData$cell_type # create a rowData slot with feature_symbol rd = data.frame(feature_symbol=rownames(ref.sce)) rownames(rd) = rownames(ref.sce) rowData(ref.sce) = rd # same for the ctrl dataset # create a rowData slot with feature_symbol rd = data.frame(feature_symbol=rownames(ctrl.sce)) rownames(rd) = rownames(ctrl.sce) rowData(ctrl.sce) = rd # select features counts(ctrl.sce) <- as.matrix(counts(ctrl.sce)) logcounts(ctrl.sce) <- as.matrix(logcounts(ctrl.sce)) ctrl.sce <- selectFeatures(ctrl.sce, suppress_plot = TRUE) counts(ref.sce) <- as.matrix(counts(ref.sce)) logcounts(ref.sce) <- as.matrix(logcounts(ref.sce)) ref.sce <- selectFeatures(ref.sce, suppress_plot = TRUE) ``` Then we need to index the reference dataset by cluster, default is the clusters in `cell_type1`. ```{r} ref.sce <- indexCluster(ref.sce) ``` Now we project the Covid-19 dataset onto that index. ```{r} project_cluster <- scmapCluster( projection = ctrl.sce, index_list = list( ref = metadata(ref.sce)$scmap_cluster_index ) ) # projected labels table(project_cluster$scmap_cluster_labs) ``` Then add the predictions to metadata and plot umap. ```{r} # add in predictions ctrl.sce@colData$scmap_cluster <- project_cluster$scmap_cluster_labs plotReducedDim(ctrl.sce,dimred = "UMAP",colour_by = "scmap_cluster") ``` ## scMap cell We can instead index the refernce data based on each single cell and project our data onto the closest neighbor in that dataset. ```{r} ref.sce <- indexCell(ref.sce) ``` Again we need to index the reference dataset. ```{r} project_cell <- scmapCell( projection = ctrl.sce, index_list = list( ref = metadata(ref.sce)$scmap_cell_index ) ) ``` We now get a table with index for the 5 nearest neigbors in the reference dataset for each cell in our dataset. We will select the celltype of the closest neighbor and assign it to the data. ```{r} cell_type_pred <- colData(ref.sce)$cell_type1[project_cell$ref[[1]][1,]] table(cell_type_pred) ``` Then add the predictions to metadata and plot umap. ```{r} # add in predictions ctrl.sce@colData$scmap_cell <- cell_type_pred plotReducedDim(ctrl.sce,dimred = "UMAP",colour_by = "scmap_cell") ``` Plot both: ```{r} cowplot::plot_grid( ncol = 2, plotReducedDim(ctrl.sce,dimred = "UMAP",colour_by = "scmap_cluster"), plotReducedDim(ctrl.sce,dimred = "UMAP",colour_by = "scmap_cell") ) ``` # scPred scPred will train a classifier based on all principal components. First, `getFeatureSpace` will create a scPred object stored in the `@misc` slot where it extracts the PCs that best separates the different celltypes. Then `trainModel` will do the actual training for each celltype. scPred works with Seurat objects, so we will convert both objects to seurat objects. You may see a lot of warnings about renaming things, but as long as you do not see an Error, you should be fine. ```{r} suppressPackageStartupMessages(library(Seurat)) reference <- Seurat::as.Seurat(ref.sce) ctrl <- Seurat::as.Seurat(ctrl.sce) ``` The loadings matrix is lost when converted to Seurat object, and scPred needs that information. So we need to rerun PCA with Seurat and the same hvgs. ```{r} VariableFeatures(reference) = hvg.ref reference <- reference %>% ScaleData(verbose=F) %>% RunPCA(verbose=F) VariableFeatures(ctrl) = hvg.ctrl ctrl <- ctrl %>% ScaleData(verbose=F) %>% RunPCA(verbose=F) ``` ```{r} reference <- getFeatureSpace(reference, "cell_type") reference <- trainModel(reference) ``` We can then print how well the training worked for the different celltypes by printing the number of PCs used for each, the ROC value and Sensitivity/Specificity. Which celltypes do you think are harder to classify based on this dataset? ```{r} get_scpred(reference) ``` You can optimize parameters for each dataset by chaning parameters and testing different types of models, see more at: https://powellgenomicslab.github.io/scPred/articles/introduction.html. But for now, we will continue with this model. Now, lets predict celltypes on our data, where scPred will align the two datasets with Harmony and then perform classification. ```{r} ctrl <- scPredict(ctrl, reference) ``` ```{r} DimPlot(ctrl, group.by = "scpred_prediction", label = T, repel = T) + NoAxes() ``` Now plot how many cells of each celltypes can be found in each cluster. ```{r} ggplot(ctrl@meta.data, aes(x=louvain_SNNk15, fill = scpred_prediction)) + geom_bar() + theme_classic() ``` Add the predictions into the SCE object ```{r} ctrl.sce@colData$scpred_prediction = ctrl$scpred_prediction ``` # Compare results Now we will compare the output of the two methods using the convenient function in scPred `crossTab` that prints the overlap between two metadata slots. ```{r} crossTab(ctrl, "scmap_cell", "scpred_prediction") ``` # GSEA with celltype markers Another option, where celltype can be classified on cluster level is to use gene set enrichment among the DEGs with known markers for different celltypes. Similar to how we did functional enrichment for the DEGs in the Differential expression exercise. There are some resources for celltype gene sets that can be used. Such as [CellMarker](http://bio-bigdata.hrbmu.edu.cn/CellMarker/), [PanglaoDB](https://panglaodb.se/) or celltype gene sets at [MSigDB](https://www.gsea-msigdb.org/gsea/msigdb/index.jsp). We can also look at overlap between DEGs in a reference dataset and the dataset you are analysing. ## DEG overlap First, lets extract top DEGs for our Covid-19 dataset and the reference dataset. When we run differential expression for our dataset, we want to report as many genes as possible, hence we set the cutoffs quite lenient. ```{r} # run differential expression in our dataset, using clustering at resolution 0.3 DGE_list <- scran::findMarkers( x = alldata, groups = as.character(alldata@colData$louvain_SNNk15), pval.type = "all", min.prop = 0) ``` ```{r} # Compute differential gene expression in reference dataset (that has cell annotation) ref_DGE <- scran::findMarkers( x = ref.sce, groups = as.character(ref.sce@colData$cell_type), pval.type = "all", direction = "up") # Identify the top cell marker genes in reference dataset # select top 50 with hihgest foldchange among top 100 signifcant genes. ref_list <- lapply(ref_DGE, function(x){ x$logFC <- rowSums( as.matrix( x[,grep("logFC",colnames(x))] )) x %>% as.data.frame() %>% filter(p.value < 0.01) %>% top_n(-100, p.value) %>% top_n(50, logFC) %>% rownames() } ) unlist(lapply(ref_list, length)) ``` Now we can run GSEA for the DEGs from our dataset and check for enrichment of top DEGs in the reference dataset. ```{r} suppressPackageStartupMessages(library(fgsea)) # run fgsea for each of the clusters in the list res <- lapply(DGE_list, function(x){ x$logFC <- rowSums( as.matrix( x[,grep("logFC",colnames(x))] )) gene_rank <- setNames(x$logFC, rownames(x)) fgseaRes <- fgsea( pathways=ref_list, stats=gene_rank,nperm=10000) return(fgseaRes) }) names(res) <- names(DGE_list) # You can filter and resort the table based on ES, NES or pvalue res <- lapply(res, function(x) {x[ x$pval < 0.1 , ]} ) res <- lapply(res, function(x) {x[ x$size > 2 , ]} ) res <- lapply(res, function(x) {x[order(x$NES,decreasing = T), ]} ) res ``` Selecing top significant overlap per cluster, we can now rename the clusters according to the predicted labels. OBS! Be aware that if you have some clusters that have bad p-values for all the gene sets, the cluster label will not be very reliable. Also, the gene sets you are using may not cover all the celltypes you have in your dataset and hence predictions may just be the most similar celltype. Also, some of the clusters have very similar p-values to multiple celltypes, for instance the ncMono and cMono celltypes are equally good for some clusters. ```{r} new.cluster.ids <- unlist(lapply(res,function(x){as.data.frame(x)[1,1]})) alldata@colData$ref_gsea <- new.cluster.ids[as.character(alldata@colData$louvain_SNNk15)] cowplot::plot_grid( ncol = 2, plotReducedDim(alldata, dimred = "UMAP",colour_by = "louvain_SNNk15"), plotReducedDim(alldata, dimred = "UMAP",colour_by = "ref_gsea") ) ``` Compare to results with the other celltype prediction methods in the ctrl_13 sample. ```{r, fig.width=10} ctrl.sce@colData$ref_gsea = alldata@colData$ref_gsea[alldata@colData$sample == "ctrl.13"] cowplot::plot_grid( ncol = 3, plotReducedDim(ctrl.sce, dimred = "UMAP",colour_by = "ref_gsea"), plotReducedDim(ctrl.sce, dimred = "UMAP",colour_by = "scmap_cell"), plotReducedDim(ctrl.sce, dimred = "UMAP",colour_by = "scpred_prediction") ) ``` ## With annotated gene sets First download celltype gene sets from CellMarker. ```{r} # Download gene marker list if(!dir.exists("data/CellMarker_list/")) { dir.create("data/CellMarker_list") download.file(url="http://bio-bigdata.hrbmu.edu.cn/CellMarker/download/Human_cell_markers.txt", destfile = "./data/CellMarker_list/Human_cell_markers.txt") download.file(url="http://bio-bigdata.hrbmu.edu.cn/CellMarker/download/Mouse_cell_markers.txt", destfile = "./data/CellMarker_list/Mouse_cell_markers.txt") } ``` Read in the gene lists and do some filering. ```{r} # Load the human marker table markers <- read.delim("data/CellMarker_list/Human_cell_markers.txt") markers <- markers [ markers$speciesType == "Human", ] markers <- markers [ markers$cancerType == "Normal", ] #Filter by tissue (to reduce computational time and have tissue-specific classification) # sort(unique(markers$tissueType)) # grep("blood",unique(markers$tissueType),value = T) # markers <- markers [ markers$tissueType %in% c("Blood","Venous blood", # "Serum","Plasma", # "Spleen","Bone marrow","Lymph node"), ] # remove strange characters etc. celltype_list <- lapply( unique(markers$cellName) , function(x){ x <- paste(markers$geneSymbol[markers$cellName == x],sep=",") x <- gsub("[[]|[]]| |-",",",x) x <- unlist(strsplit( x , split = ",")) x <- unique(x [ ! x %in% c("","NA","family") ]) x <- casefold(x,upper = T) }) names(celltype_list) <- unique(markers$cellName) # celltype_list <- lapply(celltype_list , function(x) {x[1:min(length(x),50)]} ) celltype_list <- celltype_list[ unlist(lapply(celltype_list,length)) < 100 ] celltype_list <- celltype_list[ unlist(lapply(celltype_list,length)) > 5 ] ``` ```{r} # run fgsea for each of the clusters in the list res <- lapply(DGE_list, function(x){ x$logFC <- rowSums( as.matrix( x[,grep("logFC",colnames(x))] )) gene_rank <- setNames(x$logFC, rownames(x)) fgseaRes <- fgsea( pathways=celltype_list, stats=gene_rank,nperm=10000) return(fgseaRes) }) names(res) <- names(DGE_list) # You can filter and resort the table based on ES, NES or pvalue res <- lapply(res, function(x) {x[ x$pval < 0.01 , ]} ) res <- lapply(res, function(x) {x[ x$size > 5 , ]} ) res <- lapply(res, function(x) {x[order(x$NES,decreasing = T), ]} ) # show top 3 for each cluster. lapply(res,head,3) ``` #CT_GSEA8: ```{r} new.cluster.ids <- unlist(lapply(res,function(x){as.data.frame(x)[1,1]})) alldata@colData$cellmarker_gsea <- new.cluster.ids[as.character(alldata@colData$louvain_SNNk15)] cowplot::plot_grid( ncol = 2, plotReducedDim(alldata,dimred = "UMAP",colour_by = "cellmarker_gsea"), plotReducedDim(alldata,dimred = "UMAP",colour_by = "ref_gsea") ) ``` Do you think that the methods overlap well? Where do you see the most inconsistencies? In this case we do not have any ground truth, and we cannot say wich method performs best. You should keep in mind, that any celltype classification method is just a prediction, and you still need to use your common sense and knowledge of the biological system to judge if the results make sense. # Save data Finally, lets save the data with predictions. ```{r} saveRDS(ctrl.sce,"data/results/ctrl13_qc_dr_int_cl_celltype.rds") ``` ### Session Info *** ```{r} sessionInfo() ```