--- title: #INTEG_TITLE: author: "Åsa Björklund & Paulo Czarnewski" date: '`r format(Sys.Date(), "%B %d, %Y")`' output: html_document: self_contained: true highlight: tango df_print: paged toc: yes toc_float: collapsed: false smooth_scroll: true toc_depth: 3 keep_md: yes fig_caption: true html_notebook: self_contained: true highlight: tango df_print: paged toc: yes toc_float: collapsed: false smooth_scroll: true toc_depth: 3 editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, warning=FALSE, result='hold',fig.width=12,tidy=TRUE) knitr::opts_knit$set(progress=TRUE,verbose=TRUE) ``` In this tutorial we will look at different ways of integrating multiple single cell RNA-seq datasets. We will explore two different methods to correct for batch effects across datasets. We will also look at a quantitative measure to assess the quality of the integrated data. Seurat uses the data integration method presented in Comprehensive Integration of Single Cell Data, while Scran and Scanpy use a mutual Nearest neighbour method (MNN). Below you can find a list of the most recent methods for single data integration: Markdown | Language | Library | Ref --- | --- | --- | --- CCA | R | Seurat | [Cell](https://www.sciencedirect.com/science/article/pii/S0092867419305598?via%3Dihub) MNN | R/Python | Scater/Scanpy | [Nat. Biotech.](https://www.nature.com/articles/nbt.4091) Conos | R | conos | [Nat. Methods](https://www.nature.com/articles/s41592-019-0466-z?error=cookies_not_supported&code=5680289b-6edb-40ad-9934-415dac4fdb2f) Scanorama | Python | scanorama | [Nat. Biotech.](https://www.nature.com/articles/s41587-019-0113-3) Let's first load necessary libraries and the data saved in the previous lab. ```{r, message='hide',warning='hide',results='hold'} suppressPackageStartupMessages({ library(Seurat) library(cowplot) library(ggplot2) }) alldata <- readRDS("data/results/covid_qc_dr.rds") print(names(alldata@reductions)) ``` We split the combined object into a list, with each dataset as an element. We perform standard preprocessing (log-normalization), and identify variable features individually for each dataset based on a variance stabilizing transformation ("vst"). ```{r, message='hide',warning='hide',results='hold'} alldata.list <- SplitObject(alldata, split.by = "orig.ident") for (i in 1:length(alldata.list)) { alldata.list[[i]] <- NormalizeData(alldata.list[[i]], verbose = FALSE) alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst", nfeatures = 2000,verbose = FALSE) } hvgs_per_dataset <- lapply(alldata.list, function(x) { x@assays$RNA@var.features }) # venn::venn(hvgs_per_dataset,opacity = .4,zcolor = scales::hue_pal()(3),cexsn = 1,cexil = 1,lwd=1,col="white",frame=F,borders = NA) temp <- unique(unlist(hvgs_per_dataset)) overlap <- sapply( hvgs_per_dataset , function(x) { temp %in% x } ) pheatmap::pheatmap(t(overlap*1),cluster_rows = F , color = c("grey90","grey20")) ``` We identify anchors using the FindIntegrationAnchors function, which takes a list of Seurat objects as input. ```{r, message='hide',warning='hide',results='hold'} alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:30,reduction = "cca") ``` We then pass these anchors to the IntegrateData function, which returns a Seurat object. ```{r, message='hide',warning='hide',results='hold'} alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA") ``` We can observe that a new assay slot is now created under the name `CCA`. ```{r, message='hide',warning='hide',results='hold'} names(alldata.int@assays) # by default, Seurat now sets the integrated assay as the default assay, so any operation you now perform will be on the ingegrated data. alldata.int@active.assay ``` After running IntegrateData, the Seurat object will contain a new Assay with the integrated (or ‘batch-corrected’) expression matrix. Note that the original (uncorrected values) are still stored in the object in the “RNA” assay, so you can switch back and forth. We can then use this new integrated matrix for downstream analysis and visualization. Here we scale the integrated data, run PCA, and visualize the results with UMAP and TSNE. The integrated datasets cluster by cell type, instead of by technology. ```{r, message='hide',warning='hide',results='hold'} #Run Dimensionality reduction on integrated space alldata.int <- ScaleData(alldata.int, verbose = FALSE) alldata.int <- RunPCA(alldata.int, npcs = 30, verbose = FALSE) alldata.int <- RunUMAP(alldata.int, dims = 1:30) alldata.int <- RunTSNE(alldata.int, dims = 1:30) ``` We can now plot the un-integrated and the integrated space reduced dimensions. ```{r, message='hide',warning='hide',results='hold',fig.asp=.55,fig.width=16} plot_grid(ncol = 3, DimPlot(alldata, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA raw_data"), DimPlot(alldata, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE raw_data"), DimPlot(alldata, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP raw_data"), DimPlot(alldata.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"), DimPlot(alldata.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"), DimPlot(alldata.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated") ) ``` Let's plot some marker genes for different celltypes onto the embedding. Some genes are: Markers | Cell Type --- | --- CD3E | T cells CD3E CD4 | CD4+ T cells CD3E CD8A | CD8+ T cells GNLY, NKG7 | NK cells MS4A1 | B cells CD14, LYZ, CST3, MS4A7 | CD14+ Monocytes FCGR3A, LYZ, CST3, MS4A7 | FCGR3A+ Monocytes FCER1A, CST3 | DCs ```{r,message='hide',warning='hide', results='hold',results='hold',fig.asp=.65,fig.width=16} myfeatures <- c("CD3E","CD4","CD8A","NKG7","GNLY","MS4A1","CD14","LYZ","MS4A7","FCGR3A","CST3","FCER1A") plot_list <- list() for(i in myfeatures){ plot_list[[i]] <- FeaturePlot(alldata, reduction = "umap",dims = 1:2, features = i,ncol = 3,order = T) + NoLegend() + NoAxes() + NoGrid() } plot_grid(ncol=3, plotlist = plot_list) ``` ```{r,message='hide',warning='hide', results='hold',results='hold',fig.height=5,fig.width=16} library(harmony) alldata.harmony <- RunHarmony( alldata, group.by.vars = "orig.ident", reduction = "pca", dims.use = 1:50, assay.use = "RNA") #Here we use all PCs computed from Harmony for UMAP calculation alldata.int[["harmony"]] <- alldata.harmony[["harmony"]] alldata.int <- RunUMAP(alldata.int, dims = 1:50, reduction = "harmony", reduction.name = "umap_harmony") ``` ```{r,message='hide',warning='hide', results='hold',results='hold',fig.height=5,fig.width=16} hvgs <- unique(unlist(hvgs_per_dataset)) assaylist <- list() genelist <- list() for(i in 1:length(alldata.list)) { assaylist[[i]] <- t(as.matrix(GetAssayData(alldata.list[[i]], "data")[hvgs,])) genelist[[i]] <- hvgs } lapply(assaylist,dim) ``` ```{r,message='hide',warning='hide', results='hold',results='hold',fig.height=5,fig.width=16} library(reticulate) scanorama <- import("scanorama") integrated.data <- scanorama$integrate(datasets_full = assaylist, genes_list = genelist ) intdimred <- do.call(rbind, integrated.data[[1]]) colnames(intdimred) <- paste0("PC_", 1:100) rownames(intdimred) <- colnames(alldata.int) # Add standard deviations in order to draw Elbow Plots in Seurat stdevs <- apply(intdimred, MARGIN = 2, FUN = sd) alldata.int[["scanorama"]] <- CreateDimReducObject( embeddings = intdimred, stdev = stdevs, key = "PC_", assay = "RNA") #Here we use all PCs computed from Scanorama for UMAP calculation alldata.int <- RunUMAP(alldata.int, dims = 1:100, reduction = "scanorama",reduction.name = "umap_scanorama") ``` ```{r, message='hide',warning='hide',results='hold',fig.asp=.55,fig.width=16} p1 <- DimPlot(alldata, reduction = "umap", group.by = "orig.ident")+ggtitle("UMAP raw_data") p2 <- DimPlot(alldata.int, reduction = "umap", group.by = "orig.ident")+ggtitle("UMAP CCA") p3 <- DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident")+ggtitle("UMAP Harmony") p4 <- DimPlot(alldata.int, reduction = "umap_scanorama", group.by = "orig.ident")+ggtitle("UMAP Scanorama") leg <- get_legend(p1) gridExtra::grid.arrange( gridExtra::arrangeGrob( p1 + NoLegend() + NoAxes(), p2 + NoLegend() + NoAxes(), p3 + NoLegend() + NoAxes(), p4 + NoLegend() + NoAxes(), nrow=2), leg, ncol=2,widths=c(8,2) ) ``` Finally, lets save the integrated data for further analysis. ```{r,message='hide',warning='hide', results='hold',results='hold',fig.height=5,fig.width=16} saveRDS(alldata.int,"data/results/covid_qc_dr_int.rds") ``` ### Session Info *** ```{r} sessionInfo() ```