{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Installation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`devtools::install_github(\"zji90/SCRATdatamm9\")` \n", "`source(\"https://raw.githubusercontent.com/zji90/SCRATdata/master/installcode.R\")` " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n", " colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n", " Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n", " lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n", " pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n", " rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n", " unsplit, which, which.max, which.min\n", "\n", "Loading required package: S4Vectors\n", "Loading required package: stats4\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "Loading required package: GenomeInfoDb\n", "Loading required package: GenomicRanges\n", "Loading required package: SummarizedExperiment\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "Loading required package: BiocParallel\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " aperm, apply\n", "\n", "Loading required package: Biostrings\n", "Loading required package: XVector\n", "\n", "Attaching package: ‘Biostrings’\n", "\n", "The following object is masked from ‘package:DelayedArray’:\n", "\n", " type\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " strsplit\n", "\n", "Loading required package: Rsamtools\n", "Warning message:\n", "“replacing previous import ‘DT::dataTableOutput’ by ‘shiny::dataTableOutput’ when loading ‘SCRAT’”Warning message:\n", "“replacing previous import ‘DT::renderDataTable’ by ‘shiny::renderDataTable’ when loading ‘SCRAT’”Warning message:\n", "“replacing previous import ‘mclust::em’ by ‘shiny::em’ when loading ‘SCRAT’”" ] } ], "source": [ "library(devtools)\n", "library(GenomicAlignments)\n", "library(Rsamtools)\n", "library(SCRATdatahg19)\n", "library(SCRAT)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Obtain Feature Matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "start_time = Sys.time()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "metadata <- read.table('../input/metadata.tsv',\n", " header = TRUE,\n", " stringsAsFactors=FALSE,quote=\"\",row.names=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "SCRATsummary <- function (dir = \"\", genome, bamfile = NULL, singlepair = \"automated\", \n", " removeblacklist = T, log2transform = T, adjustlen = T, featurelist = c(\"GENE\", \n", " \"ENCL\", \"MOTIF_TRANSFAC\", \"MOTIF_JASPAR\", \"GSEA\"), customfeature = NULL, \n", " Genestarttype = \"TSSup\", Geneendtype = \"TSSdown\", Genestartbp = 3000, \n", " Geneendbp = 1000, ENCLclunum = 2000, Motifflank = 100, GSEAterm = \"c5.bp\", \n", " GSEAstarttype = \"TSSup\", GSEAendtype = \"TSSdown\", GSEAstartbp = 3000, \n", " GSEAendbp = 1000) \n", "{\n", " if (is.null(bamfile)) {\n", " bamfile <- list.files(dir, pattern = \".bam$\")\n", " }\n", " datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n", " genome))\n", " bamdata <- list()\n", " for (i in bamfile) {\n", " filepath <- file.path(dir, i)\n", " if (singlepair == \"automated\") {\n", " bamfile <- BamFile(filepath)\n", " tmpsingle <- readGAlignments(bamfile)\n", " tmppair <- readGAlignmentPairs(bamfile)\n", " pairendtf <- testPairedEndBam(bamfile)\n", " if (pairendtf) {\n", " tmp <- tmppair\n", " startpos <- pmin(start(first(tmp)), start(last(tmp)))\n", " endpos <- pmax(end(first(tmp)), end(last(tmp)))\n", " id <- which(!is.na(as.character(seqnames(tmp))))\n", " tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n", " }\n", " else {\n", " tmp <- GRanges(tmpsingle)\n", " }\n", " }\n", " else if (singlepair == \"single\") {\n", " tmp <- GRanges(readGAlignments(filepath))\n", " }\n", " else if (singlepair == \"pair\") {\n", " tmp <- readGAlignmentPairs(filepath)\n", " startpos <- pmin(start(first(tmp)), start(last(tmp)))\n", " endpos <- pmax(end(first(tmp)), end(last(tmp)))\n", " id <- which(!is.na(as.character(seqnames(tmp))))\n", " tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n", " }\n", " if (removeblacklist) {\n", " load(paste0(datapath, \"/gr/blacklist.rda\"))\n", " tmp <- tmp[-as.matrix(findOverlaps(tmp, gr))[, 1], \n", " ]\n", " }\n", " bamdata[[i]] <- tmp\n", " }\n", " bamsummary <- sapply(bamdata, length)\n", " allres <- NULL\n", " datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n", " genome))\n", " if (\"GENE\" %in% featurelist) {\n", " print(\"Processing GENE features\")\n", " load(paste0(datapath, \"/gr/generegion.rda\"))\n", " if (Genestarttype == \"TSSup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) - as.numeric(Genestartbp), end(gr) + \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TSSdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) + as.numeric(Genestartbp), end(gr) - \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TESup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) - as.numeric(Genestartbp), start(gr) + \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TESdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) + as.numeric(Genestartbp), start(gr) - \n", " as.numeric(Genestartbp))\n", " }\n", " if (Geneendtype == \"TSSup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) - as.numeric(Geneendbp), end(gr) + \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TSSdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) + as.numeric(Geneendbp), end(gr) - \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TESup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) - as.numeric(Geneendbp), start(gr) + \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TESdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) + as.numeric(Geneendbp), start(gr) - \n", " as.numeric(Geneendbp))\n", " }\n", " ngr <- names(gr)\n", " gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n", " grend), end = pmax(grstart, grend)))\n", " names(gr) <- ngr\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- end(gr) - start(gr) + 1\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"ENCL\" %in% featurelist) {\n", " print(\"Processing ENCL features\")\n", " load(paste0(datapath, \"/gr/ENCL\", ENCLclunum, \".rda\"))\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"MOTIF_TRANSFAC\" %in% featurelist) {\n", " print(\"Processing MOTIF_TRANSFAC features\")\n", " load(paste0(datapath, \"/gr/transfac1.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " load(paste0(datapath, \"/gr/transfac2.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " if (genome %in% c(\"hg19\", \"hg38\")) {\n", " load(paste0(datapath, \"/gr/transfac3.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - \n", " start(i) + 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " }\n", " if (\"MOTIF_JASPAR\" %in% featurelist) {\n", " print(\"Processing MOTIF_JASPAR features\")\n", " load(paste0(datapath, \"/gr/jaspar1.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " load(paste0(datapath, \"/gr/jaspar2.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"GSEA\" %in% featurelist) {\n", " print(\"Processing GSEA features\")\n", " for (i in GSEAterm) {\n", " load(paste0(datapath, \"/gr/GSEA\", i, \".rda\"))\n", " allgr <- gr\n", " for (sgrn in names(allgr)) {\n", " gr <- allgr[[sgrn]]\n", " if (GSEAstarttype == \"TSSup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) - as.numeric(GSEAstartbp), \n", " end(gr) + as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TSSdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) + as.numeric(GSEAstartbp), \n", " end(gr) - as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TESup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) - as.numeric(GSEAstartbp), start(gr) + \n", " as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TESdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) + as.numeric(GSEAstartbp), start(gr) - \n", " as.numeric(GSEAstartbp))\n", " }\n", " if (GSEAendtype == \"TSSup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) - as.numeric(GSEAendbp), end(gr) + \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TSSdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) + as.numeric(GSEAendbp), end(gr) - \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TESup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) - as.numeric(GSEAendbp), start(gr) + \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TESdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) + as.numeric(GSEAendbp), start(gr) - \n", " as.numeric(GSEAendbp))\n", " }\n", " ngr <- names(gr)\n", " gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n", " grend), end = pmax(grstart, grend)))\n", " names(gr) <- ngr\n", " allgr[[sgrn]] <- gr\n", " }\n", " gr <- allgr\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - \n", " start(i) + 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " }\n", " if (\"Custom\" %in% featurelist) {\n", " print(\"Processing custom features\")\n", " gr <- read.table(customfeature, as.is = T, sep = \"\\t\")\n", " gr <- GRanges(seqnames = gr[, 1], IRanges(start = gr[, \n", " 2], end = gr[, 3]))\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- end(gr) - start(gr) + 1\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " allres\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_out <- SCRATsummary(dir = \"../input/sc-bams_nodup/\", \n", " genome = \"mm9\",\n", " featurelist=\"MOTIF_JASPAR\",\n", " removeblacklist = FALSE,\n", " log2transform = FALSE, adjustlen = FALSE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "end_time <- Sys.time()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "end_time - start_time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "dim(df_out)\n", "df_out[1:5,1:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "head(sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "colnames(df_out) = sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2)\n", "dim(df_out)\n", "df_out[1:5,1:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if(! all(colnames(df_out) == rownames(metadata))){\n", " df_out = df_out[,rownames(metadata)]\n", " dim(df_out)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "saveRDS(df_out, file = '../output/feature_matrices/FM_SCRAT_cusanovich2018subset_motifs_no_blacklist_filtering.rds')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sessionInfo()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "save.image(file = '../output/SCRAT_cusanovich2018subset_no_blacklist_filtering.RData')" ] } ], "metadata": { "kernelspec": { "display_name": "R [conda env:ATACseq_SCRAT]", "language": "R", "name": "conda-env-ATACseq_SCRAT-r" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 2 }