{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Installation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`devtools::install_github(\"zji90/SCRATdatamm9\")` \n", "`source(\"https://raw.githubusercontent.com/zji90/SCRATdata/master/installcode.R\")` " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n", " colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n", " Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n", " lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n", " pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n", " rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n", " unsplit, which, which.max, which.min\n", "\n", "Loading required package: S4Vectors\n", "Loading required package: stats4\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "Loading required package: GenomeInfoDb\n", "Loading required package: GenomicRanges\n", "Loading required package: SummarizedExperiment\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "Loading required package: BiocParallel\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " aperm, apply\n", "\n", "Loading required package: Biostrings\n", "Loading required package: XVector\n", "\n", "Attaching package: ‘Biostrings’\n", "\n", "The following object is masked from ‘package:DelayedArray’:\n", "\n", " type\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " strsplit\n", "\n", "Loading required package: Rsamtools\n", "Warning message:\n", "“replacing previous import ‘DT::dataTableOutput’ by ‘shiny::dataTableOutput’ when loading ‘SCRAT’”Warning message:\n", "“replacing previous import ‘DT::renderDataTable’ by ‘shiny::renderDataTable’ when loading ‘SCRAT’”Warning message:\n", "“replacing previous import ‘mclust::em’ by ‘shiny::em’ when loading ‘SCRAT’”" ] } ], "source": [ "library(devtools)\n", "library(GenomicAlignments)\n", "library(Rsamtools)\n", "library(SCRATdatahg19)\n", "library(SCRAT)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Obtain Feature Matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "start_time = Sys.time()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "metadata <- read.table('../../input/metadata.tsv',\n", " header = TRUE,\n", " stringsAsFactors=FALSE,quote=\"\",row.names=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "SCRATsummary <- function (dir = \"\", genome, bamfile = NULL, singlepair = \"automated\", \n", " removeblacklist = T, log2transform = T, adjustlen = T, featurelist = c(\"GENE\", \n", " \"ENCL\", \"MOTIF_TRANSFAC\", \"MOTIF_JASPAR\", \"GSEA\"), customfeature = NULL, \n", " Genestarttype = \"TSSup\", Geneendtype = \"TSSdown\", Genestartbp = 3000, \n", " Geneendbp = 1000, ENCLclunum = 2000, Motifflank = 100, GSEAterm = \"c5.bp\", \n", " GSEAstarttype = \"TSSup\", GSEAendtype = \"TSSdown\", GSEAstartbp = 3000, \n", " GSEAendbp = 1000) \n", "{\n", " if (is.null(bamfile)) {\n", " bamfile <- list.files(dir, pattern = \".bam$\")\n", " }\n", " datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n", " genome))\n", " bamdata <- list()\n", " for (i in bamfile) {\n", " filepath <- file.path(dir, i)\n", " if (singlepair == \"automated\") {\n", " bamfile <- BamFile(filepath)\n", " tmpsingle <- readGAlignments(bamfile)\n", " tmppair <- readGAlignmentPairs(bamfile)\n", " pairendtf <- testPairedEndBam(bamfile)\n", " if (pairendtf) {\n", " tmp <- tmppair\n", " startpos <- pmin(start(first(tmp)), start(last(tmp)))\n", " endpos <- pmax(end(first(tmp)), end(last(tmp)))\n", " id <- which(!is.na(as.character(seqnames(tmp))))\n", " tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n", " }\n", " else {\n", " tmp <- GRanges(tmpsingle)\n", " }\n", " }\n", " else if (singlepair == \"single\") {\n", " tmp <- GRanges(readGAlignments(filepath))\n", " }\n", " else if (singlepair == \"pair\") {\n", " tmp <- readGAlignmentPairs(filepath)\n", " startpos <- pmin(start(first(tmp)), start(last(tmp)))\n", " endpos <- pmax(end(first(tmp)), end(last(tmp)))\n", " id <- which(!is.na(as.character(seqnames(tmp))))\n", " tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n", " }\n", " if (removeblacklist) {\n", " load(paste0(datapath, \"/gr/blacklist.rda\"))\n", " tmp <- tmp[-as.matrix(findOverlaps(tmp, gr))[, 1], \n", " ]\n", " }\n", " bamdata[[i]] <- tmp\n", " }\n", " bamsummary <- sapply(bamdata, length)\n", " allres <- NULL\n", " datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n", " genome))\n", " if (\"GENE\" %in% featurelist) {\n", " print(\"Processing GENE features\")\n", " load(paste0(datapath, \"/gr/generegion.rda\"))\n", " if (Genestarttype == \"TSSup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) - as.numeric(Genestartbp), end(gr) + \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TSSdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) + as.numeric(Genestartbp), end(gr) - \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TESup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) - as.numeric(Genestartbp), start(gr) + \n", " as.numeric(Genestartbp))\n", " }\n", " else if (Genestarttype == \"TESdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) + as.numeric(Genestartbp), start(gr) - \n", " as.numeric(Genestartbp))\n", " }\n", " if (Geneendtype == \"TSSup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) - as.numeric(Geneendbp), end(gr) + \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TSSdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " start(gr) + as.numeric(Geneendbp), end(gr) - \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TESup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) - as.numeric(Geneendbp), start(gr) + \n", " as.numeric(Geneendbp))\n", " }\n", " else if (Geneendtype == \"TESdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \"+\", \n", " end(gr) + as.numeric(Geneendbp), start(gr) - \n", " as.numeric(Geneendbp))\n", " }\n", " ngr <- names(gr)\n", " gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n", " grend), end = pmax(grstart, grend)))\n", " names(gr) <- ngr\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- end(gr) - start(gr) + 1\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"ENCL\" %in% featurelist) {\n", " print(\"Processing ENCL features\")\n", " load(paste0(datapath, \"/gr/ENCL\", ENCLclunum, \".rda\"))\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"MOTIF_TRANSFAC\" %in% featurelist) {\n", " print(\"Processing MOTIF_TRANSFAC features\")\n", " load(paste0(datapath, \"/gr/transfac1.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " load(paste0(datapath, \"/gr/transfac2.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " if (genome %in% c(\"hg19\", \"hg38\")) {\n", " load(paste0(datapath, \"/gr/transfac3.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - \n", " start(i) + 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " }\n", " if (\"MOTIF_JASPAR\" %in% featurelist) {\n", " print(\"Processing MOTIF_JASPAR features\")\n", " load(paste0(datapath, \"/gr/jaspar1.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " load(paste0(datapath, \"/gr/jaspar2.rda\"))\n", " gr <- flank(gr, as.numeric(Motifflank), both = T)\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n", " 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " if (\"GSEA\" %in% featurelist) {\n", " print(\"Processing GSEA features\")\n", " for (i in GSEAterm) {\n", " load(paste0(datapath, \"/gr/GSEA\", i, \".rda\"))\n", " allgr <- gr\n", " for (sgrn in names(allgr)) {\n", " gr <- allgr[[sgrn]]\n", " if (GSEAstarttype == \"TSSup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) - as.numeric(GSEAstartbp), \n", " end(gr) + as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TSSdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) + as.numeric(GSEAstartbp), \n", " end(gr) - as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TESup\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) - as.numeric(GSEAstartbp), start(gr) + \n", " as.numeric(GSEAstartbp))\n", " }\n", " else if (GSEAstarttype == \"TESdown\") {\n", " grstart <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) + as.numeric(GSEAstartbp), start(gr) - \n", " as.numeric(GSEAstartbp))\n", " }\n", " if (GSEAendtype == \"TSSup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) - as.numeric(GSEAendbp), end(gr) + \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TSSdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", start(gr) + as.numeric(GSEAendbp), end(gr) - \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TESup\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) - as.numeric(GSEAendbp), start(gr) + \n", " as.numeric(GSEAendbp))\n", " }\n", " else if (GSEAendtype == \"TESdown\") {\n", " grend <- ifelse(as.character(strand(gr)) == \n", " \"+\", end(gr) + as.numeric(GSEAendbp), start(gr) - \n", " as.numeric(GSEAendbp))\n", " }\n", " ngr <- names(gr)\n", " gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n", " grend), end = pmax(grstart, grend)))\n", " names(gr) <- ngr\n", " allgr[[sgrn]] <- gr\n", " }\n", " gr <- allgr\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- sapply(gr, function(i) sum(end(i) - \n", " start(i) + 1))\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " }\n", " if (\"Custom\" %in% featurelist) {\n", " print(\"Processing custom features\")\n", " gr <- read.table(customfeature, as.is = T, sep = \"\\t\")\n", " gr <- GRanges(seqnames = gr[, 1], IRanges(start = gr[, \n", " 2], end = gr[, 3]))\n", " tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n", " i))\n", " tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n", " if (log2transform) {\n", " tmp <- log2(tmp + 1)\n", " }\n", " if (adjustlen) {\n", " grrange <- end(gr) - start(gr) + 1\n", " tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n", " }\n", " tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n", " allres <- rbind(allres, tmp)\n", " }\n", " allres\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_out <- SCRATsummary(dir = \"../../input/sc-bams_nodup/\", \n", " genome = \"mm9\",\n", " featurelist=\"MOTIF_JASPAR\",\n", " log2transform = FALSE, adjustlen = FALSE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "end_time <- Sys.time()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Time difference of 1.188708 days" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "end_time - start_time" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 519
  2. \n", "\t
  3. 12178
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 519\n", "\\item 12178\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 519\n", "2. 12178\n", "\n", "\n" ], "text/plain": [ "[1] 519 12178" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bamBoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bamBoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bamBoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bamBoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam
MOTIF:MA0002.2:RUNX1779.67587 786.52359 679.96058 584.14298 832.58595
MOTIF:MA0003.3:TFAP2A760.69499 739.16432 593.73370 680.10076 1234.67862
MOTIF:MA0004.1:Arnt508.10337 528.14579 408.81688 394.62636 689.08819
MOTIF:MA0006.1:Ahr::Arnt727.11345 816.49781 618.65979 638.11923 1053.81166
MOTIF:MA0007.3:Ar 54.02249 52.15515 48.25807 56.37519 62.78027
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & BoneMarrow\\_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam & BoneMarrow\\_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam & BoneMarrow\\_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam & BoneMarrow\\_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam & BoneMarrow\\_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam\\\\\n", "\\hline\n", "\tMOTIF:MA0002.2:RUNX1 & 779.67587 & 786.52359 & 679.96058 & 584.14298 & 832.58595\\\\\n", "\tMOTIF:MA0003.3:TFAP2A & 760.69499 & 739.16432 & 593.73370 & 680.10076 & 1234.67862\\\\\n", "\tMOTIF:MA0004.1:Arnt & 508.10337 & 528.14579 & 408.81688 & 394.62636 & 689.08819\\\\\n", "\tMOTIF:MA0006.1:Ahr::Arnt & 727.11345 & 816.49781 & 618.65979 & 638.11923 & 1053.81166\\\\\n", "\tMOTIF:MA0007.3:Ar & 54.02249 & 52.15515 & 48.25807 & 56.37519 & 62.78027\\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "| | BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam | BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam | BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam |\n", "|---|---|---|---|---|---|\n", "| MOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |\n", "| MOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |\n", "| MOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |\n", "| MOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |\n", "| MOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |\n", "\n" ], "text/plain": [ " BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam\n", "MOTIF:MA0002.2:RUNX1 779.67587 \n", "MOTIF:MA0003.3:TFAP2A 760.69499 \n", "MOTIF:MA0004.1:Arnt 508.10337 \n", "MOTIF:MA0006.1:Ahr::Arnt 727.11345 \n", "MOTIF:MA0007.3:Ar 54.02249 \n", " BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam\n", "MOTIF:MA0002.2:RUNX1 786.52359 \n", "MOTIF:MA0003.3:TFAP2A 739.16432 \n", "MOTIF:MA0004.1:Arnt 528.14579 \n", "MOTIF:MA0006.1:Ahr::Arnt 816.49781 \n", "MOTIF:MA0007.3:Ar 52.15515 \n", " BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam\n", "MOTIF:MA0002.2:RUNX1 679.96058 \n", "MOTIF:MA0003.3:TFAP2A 593.73370 \n", "MOTIF:MA0004.1:Arnt 408.81688 \n", "MOTIF:MA0006.1:Ahr::Arnt 618.65979 \n", "MOTIF:MA0007.3:Ar 48.25807 \n", " BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam\n", "MOTIF:MA0002.2:RUNX1 584.14298 \n", "MOTIF:MA0003.3:TFAP2A 680.10076 \n", "MOTIF:MA0004.1:Arnt 394.62636 \n", "MOTIF:MA0006.1:Ahr::Arnt 638.11923 \n", "MOTIF:MA0007.3:Ar 56.37519 \n", " BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam\n", "MOTIF:MA0002.2:RUNX1 832.58595 \n", "MOTIF:MA0003.3:TFAP2A 1234.67862 \n", "MOTIF:MA0004.1:Arnt 689.08819 \n", "MOTIF:MA0006.1:Ahr::Arnt 1053.81166 \n", "MOTIF:MA0007.3:Ar 62.78027 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dim(df_out)\n", "df_out[1:5,1:5]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'
  2. \n", "\t
  3. 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'
  4. \n", "\t
  5. 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'
  6. \n", "\t
  7. 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'
  8. \n", "\t
  9. 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'
  10. \n", "\t
  11. 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'
  12. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'\n", "\\item 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'\n", "\\item 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'\n", "\\item 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'\n", "\\item 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'\n", "\\item 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'\n", "2. 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'\n", "3. 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'\n", "4. 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'\n", "5. 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'\n", "6. 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'\n", "\n", "\n" ], "text/plain": [ "[1] \"AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT\"\n", "[2] \"AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC\"\n", "[3] \"AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC\"\n", "[4] \"AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC\"\n", "[5] \"AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\"\n", "[6] \"AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT\"" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "head(sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 519
  2. \n", "\t
  3. 12178
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 519\n", "\\item 12178\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 519\n", "2. 12178\n", "\n", "\n" ], "text/plain": [ "[1] 519 12178" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGTAGCGATAGAATATTACTTTCCGCGGACTGTACTGACAGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGCAGCGATAGAGATTACGTTGCGCAATGACGTACTGACAGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC
MOTIF:MA0002.2:RUNX1779.67587 786.52359 679.96058 584.14298 832.58595
MOTIF:MA0003.3:TFAP2A760.69499 739.16432 593.73370 680.10076 1234.67862
MOTIF:MA0004.1:Arnt508.10337 528.14579 408.81688 394.62636 689.08819
MOTIF:MA0006.1:Ahr::Arnt727.11345 816.49781 618.65979 638.11923 1053.81166
MOTIF:MA0007.3:Ar 54.02249 52.15515 48.25807 56.37519 62.78027
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT & AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC & AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC & AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC & AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\\\\\n", "\\hline\n", "\tMOTIF:MA0002.2:RUNX1 & 779.67587 & 786.52359 & 679.96058 & 584.14298 & 832.58595\\\\\n", "\tMOTIF:MA0003.3:TFAP2A & 760.69499 & 739.16432 & 593.73370 & 680.10076 & 1234.67862\\\\\n", "\tMOTIF:MA0004.1:Arnt & 508.10337 & 528.14579 & 408.81688 & 394.62636 & 689.08819\\\\\n", "\tMOTIF:MA0006.1:Ahr::Arnt & 727.11345 & 816.49781 & 618.65979 & 638.11923 & 1053.81166\\\\\n", "\tMOTIF:MA0007.3:Ar & 54.02249 & 52.15515 & 48.25807 & 56.37519 & 62.78027\\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "| | AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC |\n", "|---|---|---|---|---|---|\n", "| MOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |\n", "| MOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |\n", "| MOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |\n", "| MOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |\n", "| MOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |\n", "\n" ], "text/plain": [ " AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT\n", "MOTIF:MA0002.2:RUNX1 779.67587 \n", "MOTIF:MA0003.3:TFAP2A 760.69499 \n", "MOTIF:MA0004.1:Arnt 508.10337 \n", "MOTIF:MA0006.1:Ahr::Arnt 727.11345 \n", "MOTIF:MA0007.3:Ar 54.02249 \n", " AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC\n", "MOTIF:MA0002.2:RUNX1 786.52359 \n", "MOTIF:MA0003.3:TFAP2A 739.16432 \n", "MOTIF:MA0004.1:Arnt 528.14579 \n", "MOTIF:MA0006.1:Ahr::Arnt 816.49781 \n", "MOTIF:MA0007.3:Ar 52.15515 \n", " AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC\n", "MOTIF:MA0002.2:RUNX1 679.96058 \n", "MOTIF:MA0003.3:TFAP2A 593.73370 \n", "MOTIF:MA0004.1:Arnt 408.81688 \n", "MOTIF:MA0006.1:Ahr::Arnt 618.65979 \n", "MOTIF:MA0007.3:Ar 48.25807 \n", " AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC\n", "MOTIF:MA0002.2:RUNX1 584.14298 \n", "MOTIF:MA0003.3:TFAP2A 680.10076 \n", "MOTIF:MA0004.1:Arnt 394.62636 \n", "MOTIF:MA0006.1:Ahr::Arnt 638.11923 \n", "MOTIF:MA0007.3:Ar 56.37519 \n", " AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\n", "MOTIF:MA0002.2:RUNX1 832.58595 \n", "MOTIF:MA0003.3:TFAP2A 1234.67862 \n", "MOTIF:MA0004.1:Arnt 689.08819 \n", "MOTIF:MA0006.1:Ahr::Arnt 1053.81166 \n", "MOTIF:MA0007.3:Ar 62.78027 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "colnames(df_out) = sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2)\n", "dim(df_out)\n", "df_out[1:5,1:5]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 519
  2. \n", "\t
  3. 12178
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 519\n", "\\item 12178\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 519\n", "2. 12178\n", "\n", "\n" ], "text/plain": [ "[1] 519 12178" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "if(! all(colnames(df_out) == rownames(metadata))){\n", " df_out = df_out[,rownames(metadata)]\n", " dim(df_out)\n", "}" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "saveRDS(df_out, file = '../../output/feature_matrices/FM_SCRAT_cusanovich2018subset_motifs.rds')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "R version 3.5.1 (2018-07-02)\n", "Platform: x86_64-conda_cos6-linux-gnu (64-bit)\n", "Running under: CentOS Linux 7 (Core)\n", "\n", "Matrix products: default\n", "BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_SCRAT/lib/R/lib/libRblas.so\n", "\n", "locale:\n", " [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C \n", " [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 \n", " [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 \n", " [7] LC_PAPER=en_US.UTF-8 LC_NAME=C \n", " [9] LC_ADDRESS=C LC_TELEPHONE=C \n", "[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C \n", "\n", "attached base packages:\n", "[1] stats4 parallel stats graphics grDevices utils datasets \n", "[8] methods base \n", "\n", "other attached packages:\n", " [1] SCRAT_0.99.0 SCRATdatahg19_0.99.1 \n", " [3] GenomicAlignments_1.18.1 Rsamtools_1.34.0 \n", " [5] Biostrings_2.50.2 XVector_0.22.0 \n", " [7] SummarizedExperiment_1.12.0 DelayedArray_0.8.0 \n", " [9] BiocParallel_1.16.6 matrixStats_0.54.0 \n", "[11] Biobase_2.42.0 GenomicRanges_1.34.0 \n", "[13] GenomeInfoDb_1.18.1 IRanges_2.16.0 \n", "[15] S4Vectors_0.20.1 BiocGenerics_0.28.0 \n", "[17] usethis_1.5.0 devtools_2.0.2 \n", "\n", "loaded via a namespace (and not attached):\n", " [1] tsne_0.1-3 bitops_1.0-6 fs_1.2.7 \n", " [4] RColorBrewer_1.1-2 rprojroot_1.3-2 repr_0.19.2 \n", " [7] tools_3.5.1 backports_1.1.4 R6_2.4.0 \n", "[10] DT_0.5 KernSmooth_2.23-15 lazyeval_0.2.2 \n", "[13] colorspace_1.4-1 withr_2.1.2 tidyselect_0.2.5 \n", "[16] prettyunits_1.0.2 processx_3.3.0 compiler_3.5.1 \n", "[19] cli_1.1.0 desc_1.2.0 caTools_1.17.1.2 \n", "[22] scales_1.0.0 callr_3.2.0 pbdZMQ_0.3-3 \n", "[25] stringr_1.4.0 digest_0.6.18 shinyBS_0.61 \n", "[28] scatterD3_0.9 dbscan_1.1-3 base64enc_0.1-3 \n", "[31] pkgconfig_2.0.2 htmltools_0.3.6 sessioninfo_1.1.1 \n", "[34] htmlwidgets_1.3 rlang_0.3.4 shiny_1.3.2 \n", "[37] jsonlite_1.6 mclust_5.4.3 gtools_3.8.1 \n", "[40] dplyr_0.8.0.1 RCurl_1.95-4.11 magrittr_1.5 \n", "[43] GenomeInfoDbData_1.2.0 Matrix_1.2-17 Rcpp_1.0.1 \n", "[46] IRkernel_0.8.15 munsell_0.5.0 stringi_1.4.3 \n", "[49] zlibbioc_1.28.0 pkgbuild_1.0.3 gplots_3.0.1.1 \n", "[52] plyr_1.8.4 grid_3.5.1 gdata_2.18.0 \n", "[55] promises_1.0.1 crayon_1.3.4 lattice_0.20-38 \n", "[58] IRdisplay_0.7.0 ps_1.3.0 pillar_1.3.1 \n", "[61] uuid_0.1-2 reshape2_1.4.3 pkgload_1.0.2 \n", "[64] glue_1.3.1 evaluate_0.13 remotes_2.0.4 \n", "[67] httpuv_1.5.1 gtable_0.3.0 purrr_0.3.2 \n", "[70] assertthat_0.2.1 ggplot2_3.1.1 mime_0.6 \n", "[73] xtable_1.8-4 later_0.8.0 tibble_2.1.1 \n", "[76] pheatmap_1.0.12 memoise_1.1.0 ellipse_0.4.1 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sessionInfo()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "save.image(file = 'SCRAT_cusanovich2018subset.RData')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "R [conda env:ATACseq_SCRAT]", "language": "R", "name": "conda-env-ATACseq_SCRAT-r" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 2 }