{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "Attaching package: ‘dplyr’\n", "\n", "The following objects are masked from ‘package:data.table’:\n", "\n", " between, first, last\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " filter, lag\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " intersect, setdiff, setequal, union\n", "\n", "Loading required package: MASS\n", "\n", "Attaching package: ‘MASS’\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " select\n", "\n", "Loading required package: ggplot2\n", "\n", "Attaching package: ‘cowplot’\n", "\n", "The following object is masked from ‘package:ggplot2’:\n", "\n", " ggsave\n", "\n", "Loading required package: GenomicRanges\n", "Loading required package: stats4\n", "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " combine, intersect, setdiff, union\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n", " colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n", " Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n", " lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n", " pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n", " rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n", " unsplit, which, which.max, which.min\n", "\n", "Loading required package: S4Vectors\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " first, rename\n", "\n", "The following objects are masked from ‘package:data.table’:\n", "\n", " first, second\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "\n", "Attaching package: ‘IRanges’\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " collapse, desc, slice\n", "\n", "The following object is masked from ‘package:data.table’:\n", "\n", " shift\n", "\n", "Loading required package: GenomeInfoDb\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " count\n", "\n", "Loading required package: BiocParallel\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " aperm, apply\n", "\n", "\n", "Loading required package: BSgenome\n", "Loading required package: Biostrings\n", "Loading required package: XVector\n", "\n", "Attaching package: ‘Biostrings’\n", "\n", "The following object is masked from ‘package:DelayedArray’:\n", "\n", " type\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " strsplit\n", "\n", "Loading required package: rtracklayer\n", "Loading required package: Matrix\n", "\n", "Attaching package: ‘Matrix’\n", "\n", "The following object is masked from ‘package:S4Vectors’:\n", "\n", " expand\n", "\n", "Loading required package: monocle\n", "Loading required package: VGAM\n", "Loading required package: splines\n", "Loading required package: DDRTree\n", "Loading required package: Gviz\n", "Loading required package: grid\n", "Warning message:\n", "\"replacing previous import 'GenomicRanges::shift' by 'data.table::shift' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::last' by 'dplyr::last' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::union' by 'dplyr::union' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::intersect' by 'dplyr::intersect' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::setdiff' by 'dplyr::setdiff' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::first' by 'dplyr::first' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::between' by 'dplyr::between' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::failwith' by 'plyr::failwith' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::id' by 'plyr::id' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::summarize' by 'plyr::summarize' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::count' by 'plyr::count' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::desc' by 'plyr::desc' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::mutate' by 'plyr::mutate' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::arrange' by 'plyr::arrange' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::rename' by 'plyr::rename' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::summarise' by 'plyr::summarise' when loading 'cisTopic'\"Loading required package: mclust\n", "Package 'mclust' version 5.4.3\n", "Type 'citation(\"mclust\")' for citing this R package in publications.\n", "Loading required package: tsne\n" ] } ], "source": [ "library(data.table)\n", "library(dplyr)\n", "library(BuenColors)\n", "library(stringr)\n", "library(cowplot)\n", "library(SummarizedExperiment)\n", "library(chromVAR)\n", "library(BSgenome.Hsapiens.UCSC.hg19)\n", "library(JASPAR2016)\n", "library(motifmatchr)\n", "library(GenomicRanges)\n", "library(irlba)\n", "library(cicero)\n", "library(umap)\n", "library(cisTopic)\n", "library(prabclus)\n", "library(BrockmanR)\n", "library(jackstraw)\n", "library(Matrix)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### define functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "select_peaks <- function (se,n_peaks = 80000){\n", " datafr = assays(se)$count\n", " binary_mat = as.matrix((datafr > 0) + 0)\n", " binary_mat = Matrix(binary_mat, sparse = TRUE) \n", " num_cells_ncounted = Matrix::rowSums(binary_mat)\n", " indices_sorted = order(num_cells_ncounted,decreasing = TRUE)[1:n_peaks]\n", " indices = sort(indices_sorted)\n", " ncounts = binary_mat[indices,]\n", " ncounts = ncounts[Matrix::rowSums(ncounts) > 0,] \n", " new_counts = Matrix::colSums(ncounts)\n", " options(repr.plot.width=8, repr.plot.height=4)\n", " par(mfrow=c(1,2))\n", " hist(log10(num_cells_ncounted),\n", " xlab = 'log10(Number of Cells)',\n", " main=\"No. of Cells Each Site is Observed In\",breaks=50)\n", " abline(v=log10(num_cells_ncounted[indices_sorted[length(indices_sorted)]]),\n", " lwd=2,col=\"indianred\")\n", " hist(log10(new_counts),\n", " xlab = 'log10(Number of Sites)',\n", " main=\"Number of Sites Each Cell Uses\",breaks=50)\n", " return(se[rownames(ncounts),])\n", "}\n", "\n", "\n", "run_pca <- function(mat,num_pcs=50,remove_first_PC=FALSE,scale=FALSE,center=FALSE){\n", " set.seed(2019) \n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)\n", " diag(sk_diag) = SVD$d\n", " if(remove_first_PC){\n", " sk_diag[1,1] = 0\n", " SVD_vd = (sk_diag %*% t(SVD$v))[2:num_pcs,]\n", " }else{\n", " SVD_vd = sk_diag %*% t(SVD$v)\n", " }\n", " return(SVD_vd)\n", "}\n", "\n", "elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){\n", " set.seed(2019) \n", " mat = data.matrix(mat)\n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " options(repr.plot.width=width, repr.plot.height=height)\n", " df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);\n", "# print(SVD$d[1:num_pcs])\n", " p <- ggplot(df_plot, aes(x = PC, y = SD)) +\n", " geom_point(col=\"#cd5c5c\",size = 1) + \n", " ggtitle(title)\n", " return(p)\n", "}\n", "\n", "run_umap <- function(fm_mat){\n", " umap_object = umap(t(fm_mat),random_state = 2019)\n", " df_umap = umap_object$layout\n", " return(df_umap)\n", "}\n", "\n", "\n", "plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){\n", " set.seed(2019) \n", " df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE)\n", " colnames(df_umap) = c('umap_1','umap_2','label')\n", " df_umap$umap_1 = as.numeric(df_umap$umap_1)\n", " df_umap$umap_2 = as.numeric(df_umap$umap_2)\n", " options(repr.plot.width=4, repr.plot.height=4)\n", " p <- ggplot(shuf(df_umap), aes(x = umap_1, y = umap_2, color = label)) +\n", " geom_point(size = 1) + scale_color_manual(values = colormap) +\n", " ggtitle(title)\n", " return(p)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Input" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "workdir = '../output/'\n", "path_umap = paste0(workdir,'umap_rds/')\n", "path_fm = paste0(workdir,'feature_matrices/')\n", "system(paste0('mkdir -p ',path_umap))\n", "system(paste0('mkdir -p ',path_fm))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df_count = readRDS('../input/bonemarrow_cov2500.rds')\n", "\n", "df_sample = read.table('../input/metadata.tsv',stringsAsFactors = FALSE)\n", "\n", "df_region = data.frame(do.call(rbind,strsplit(rownames(df_count),'_')),stringsAsFactors = FALSE)\n", "colnames(df_region) <- c(\"chr\", \"start\", \"end\")\n", "df_region$start = as.integer(df_region$start)\n", "df_region$end = as.integer(df_region$end)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "