{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "Attaching package: ‘proxy’\n", "\n", "The following object is masked from ‘package:Matrix’:\n", "\n", " as.matrix\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " as.dist, dist\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " as.matrix\n", "\n", "Loading required package: ggplot2\n", "\n", "Attaching package: ‘cowplot’\n", "\n", "The following object is masked from ‘package:ggplot2’:\n", "\n", " ggsave\n", "\n", "Loading required package: MASS\n", "Loading required package: GenomicRanges\n", "Loading required package: stats4\n", "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:Matrix’:\n", "\n", " colMeans, colSums, rowMeans, rowSums, which\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n", " colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n", " Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n", " lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n", " pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n", " rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n", " unsplit, which, which.max, which.min\n", "\n", "Loading required package: S4Vectors\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following object is masked from ‘package:Matrix’:\n", "\n", " expand\n", "\n", "The following objects are masked from ‘package:data.table’:\n", "\n", " first, second\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "\n", "Attaching package: ‘IRanges’\n", "\n", "The following object is masked from ‘package:data.table’:\n", "\n", " shift\n", "\n", "Loading required package: GenomeInfoDb\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "Loading required package: BiocParallel\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " aperm, apply\n", "\n" ] } ], "source": [ "library(data.table)\n", "library(Matrix)\n", "library(proxy)\n", "library(irlba)\n", "library(umap)\n", "library(data.table)\n", "library(cowplot)\n", "library(Matrix)\n", "library(BuenColors)\n", "library(scales)\n", "library(SummarizedExperiment)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocess" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df_count = readRDS('../../input/bonemarrow_noisy_p2.rds')\n", "\n", "df_sample = read.table('../../input/metadata.tsv',stringsAsFactors = FALSE)\n", "\n", "df_region = data.frame(do.call(rbind,strsplit(rownames(df_count),'_')),stringsAsFactors = FALSE)\n", "colnames(df_region) <- c(\"chr\", \"start\", \"end\")\n", "df_region$start = as.integer(df_region$start)\n", "df_region$end = as.integer(df_region$end)\n", "\n", "gr_region = makeGRangesFromDataFrame(df_region, keep.extra.columns = TRUE)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Make RangedSummarizedExperiment\n", "se <- SummarizedExperiment(\n", " rowRanges = gr_region,\n", " colData = df_sample,\n", " assays = list(counts = df_count)\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "class: RangedSummarizedExperiment \n", "dim: 156311 1200 \n", "metadata(0):\n", "assays(1): counts\n", "rownames(156311): chr1_9942_10442 chr1_11036_11536 ...\n", " chrX_155258908_155259408 chrX_155259892_155260392\n", "rowData names(0):\n", "colnames(1200): CD4_1 CD4_2 ... NK_1199 NK_1200\n", "colData names(1): label" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "se" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "run_pca <- function(mat,num_pcs=50,scale=FALSE,center=FALSE){\n", " set.seed(2019) \n", " mat = as.matrix(mat)\n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)\n", " diag(sk_diag) = SVD$d\n", " SVD_vd = t(sk_diag %*% t(SVD$v))\n", " return(t(SVD_vd))\n", "}\n", "\n", "elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){\n", " set.seed(2019) \n", " mat = data.matrix(mat)\n", " SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)\n", " options(repr.plot.width=width, repr.plot.height=height)\n", " df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);\n", "# print(SVD$d[1:num_pcs])\n", " p <- ggplot(df_plot, aes(x = PC, y = SD)) +\n", " geom_point(col=\"#cd5c5c\",size = 1) + \n", " ggtitle(title)\n", " return(p)\n", "}\n", "\n", "run_umap <- function(fm_mat){\n", " umap_object = umap(t(fm_mat),random_state = 2019)\n", " df_umap = umap_object$layout\n", " return(df_umap)\n", "}\n", "\n", "\n", "plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){\n", " set.seed(2019) \n", " df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE)\n", " colnames(df_umap) = c('umap1','umap2','celltype')\n", " df_umap$umap1 = as.numeric(df_umap$umap1)\n", " df_umap$umap2 = as.numeric(df_umap$umap2)\n", " options(repr.plot.width=4, repr.plot.height=4)\n", " p <- ggplot(shuf(df_umap), aes(x = umap1, y = umap2, color = celltype)) +\n", " geom_point(size = 1) + scale_color_manual(values = colormap) +\n", " ggtitle(title)\n", " return(p)\n", "}\n", "\n", "run_control <- function(datafr,num_pcs=10){\n", " mat = as.matrix(datafr)\n", " fm_control = run_pca(mat,num_pcs = num_pcs)\n", " return(fm_control)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Analysis" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "workdir = './peaks_frequency_results/'\n", "path_umap = paste0(workdir,'umap_rds/')\n", "path_fm = paste0(workdir,'feature_matrices/')\n", "system(paste0('mkdir -p ',path_umap))\n", "system(paste0('mkdir -p ',path_fm))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "datafr = assays(se)$counts" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "