{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "Attaching package: ‘dplyr’\n", "\n", "The following objects are masked from ‘package:data.table’:\n", "\n", " between, first, last\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " filter, lag\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " intersect, setdiff, setequal, union\n", "\n", "Loading required package: MASS\n", "\n", "Attaching package: ‘MASS’\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " select\n", "\n", "Loading required package: ggplot2\n", "\n", "Attaching package: ‘cowplot’\n", "\n", "The following object is masked from ‘package:ggplot2’:\n", "\n", " ggsave\n", "\n", "Loading required package: GenomicRanges\n", "Loading required package: stats4\n", "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:Matrix’:\n", "\n", " colMeans, colSums, rowMeans, rowSums, which\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " combine, intersect, setdiff, union\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n", " colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n", " Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n", " lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n", " pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n", " rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n", " unsplit, which, which.max, which.min\n", "\n", "Loading required package: S4Vectors\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following object is masked from ‘package:Matrix’:\n", "\n", " expand\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " first, rename\n", "\n", "The following objects are masked from ‘package:data.table’:\n", "\n", " first, second\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "\n", "Attaching package: ‘IRanges’\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " collapse, desc, slice\n", "\n", "The following object is masked from ‘package:data.table’:\n", "\n", " shift\n", "\n", "Loading required package: GenomeInfoDb\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " count\n", "\n", "Loading required package: BiocParallel\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " aperm, apply\n", "\n", "\n", "Loading required package: BSgenome\n", "Loading required package: Biostrings\n", "Loading required package: XVector\n", "\n", "Attaching package: ‘Biostrings’\n", "\n", "The following object is masked from ‘package:DelayedArray’:\n", "\n", " type\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " strsplit\n", "\n", "Loading required package: rtracklayer\n", "Loading required package: monocle\n", "Loading required package: VGAM\n", "Loading required package: splines\n", "Loading required package: DDRTree\n", "Loading required package: Gviz\n", "Loading required package: grid\n", "Warning message:\n", "\"replacing previous import 'GenomicRanges::shift' by 'data.table::shift' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::last' by 'dplyr::last' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::union' by 'dplyr::union' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::intersect' by 'dplyr::intersect' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'GenomicRanges::setdiff' by 'dplyr::setdiff' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::first' by 'dplyr::first' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'data.table::between' by 'dplyr::between' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::failwith' by 'plyr::failwith' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::id' by 'plyr::id' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::summarize' by 'plyr::summarize' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::count' by 'plyr::count' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::desc' by 'plyr::desc' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::mutate' by 'plyr::mutate' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::arrange' by 'plyr::arrange' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::rename' by 'plyr::rename' when loading 'cisTopic'\"Warning message:\n", "\"replacing previous import 'dplyr::summarise' by 'plyr::summarise' when loading 'cisTopic'\"Loading required package: mclust\n", "Package 'mclust' version 5.4.3\n", "Type 'citation(\"mclust\")' for citing this R package in publications.\n", "Loading required package: tsne\n" ] } ], "source": [ "library(data.table)\n", "library(dplyr)\n", "library(Matrix)\n", "library(BuenColors)\n", "library(stringr)\n", "library(cowplot)\n", "library(SummarizedExperiment)\n", "library(chromVAR)\n", "library(BSgenome.Hsapiens.UCSC.hg19)\n", "library(JASPAR2016)\n", "library(motifmatchr)\n", "library(GenomicRanges)\n", "library(irlba)\n", "library(cicero)\n", "library(umap)\n", "library(cisTopic)\n", "library(prabclus)\n", "library(BrockmanR)\n", "library(jackstraw)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### define functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "read_FM <- function(filename){\n", " df_FM = data.frame(readRDS(filename),stringsAsFactors=FALSE,check.names=FALSE)\n", " rownames(df_FM) <- make.names(rownames(df_FM), unique=TRUE)\n", " df_FM[is.na(df_FM)] <- 0\n", " return(df_FM)\n", "}\n", "\n", "run_pca <- function(mat,num_pcs=50,remove_first_PC=FALSE,scale=FALSE,center=FALSE){\n", " set.seed(2019) \n", " mat = as.matrix(mat)\n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)\n", " diag(sk_diag) = SVD$d\n", " if(remove_first_PC){\n", " sk_diag[1,1] = 0\n", " SVD_vd = (sk_diag %*% t(SVD$v))[2:num_pcs,]\n", " }else{\n", " SVD_vd = sk_diag %*% t(SVD$v)\n", " }\n", " return(SVD_vd)\n", "}\n", "\n", "elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){\n", " set.seed(2019) \n", " mat = data.matrix(mat)\n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " options(repr.plot.width=width, repr.plot.height=height)\n", " df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);\n", "# print(SVD$d[1:num_pcs])\n", " p <- ggplot(df_plot, aes(x = PC, y = SD)) +\n", " geom_point(col=\"#cd5c5c\",size = 1) + \n", " ggtitle(title)\n", " return(p)\n", "}\n", "\n", "run_umap <- function(fm_mat){\n", " umap_object = umap(t(fm_mat),random_state = 2019)\n", " df_umap = umap_object$layout\n", " return(df_umap)\n", "}\n", "\n", "\n", "plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){\n", " set.seed(2019) \n", " df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE)\n", " colnames(df_umap) = c('umap_1','umap_2','celltype')\n", " df_umap$umap_1 = as.numeric(df_umap$umap_1)\n", " df_umap$umap_2 = as.numeric(df_umap$umap_2)\n", " options(repr.plot.width=4, repr.plot.height=4)\n", " p <- ggplot(shuf(df_umap), aes(x = umap_1, y = umap_2, color = celltype)) +\n", " geom_point(size = 1) + scale_color_manual(values = colormap) +\n", " ggtitle(title)\n", " return(p)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Input" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "workdir = './output/'\n", "\n", "path_umap = paste0(workdir,'umap_rds/')\n", "system(paste0('mkdir -p ',path_umap))\n", "\n", "path_fm = paste0(workdir,'feature_matrices/')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "metadata <- read.table('./input/metadata.tsv',\n", " header = TRUE,\n", " stringsAsFactors=FALSE,quote=\"\",row.names=1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "