{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Installation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`devtools::install_github(\"zji90/SCRATdatamm9\")` \n",
"`source(\"https://raw.githubusercontent.com/zji90/SCRATdata/master/installcode.R\")` "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading required package: BiocGenerics\n",
"Loading required package: parallel\n",
"\n",
"Attaching package: ‘BiocGenerics’\n",
"\n",
"The following objects are masked from ‘package:parallel’:\n",
"\n",
" clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
" clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
" parLapplyLB, parRapply, parSapply, parSapplyLB\n",
"\n",
"The following objects are masked from ‘package:stats’:\n",
"\n",
" IQR, mad, sd, var, xtabs\n",
"\n",
"The following objects are masked from ‘package:base’:\n",
"\n",
" anyDuplicated, append, as.data.frame, basename, cbind, colMeans,\n",
" colnames, colSums, dirname, do.call, duplicated, eval, evalq,\n",
" Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,\n",
" lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,\n",
" pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,\n",
" rowSums, sapply, setdiff, sort, table, tapply, union, unique,\n",
" unsplit, which, which.max, which.min\n",
"\n",
"Loading required package: S4Vectors\n",
"Loading required package: stats4\n",
"\n",
"Attaching package: ‘S4Vectors’\n",
"\n",
"The following object is masked from ‘package:base’:\n",
"\n",
" expand.grid\n",
"\n",
"Loading required package: IRanges\n",
"Loading required package: GenomeInfoDb\n",
"Loading required package: GenomicRanges\n",
"Loading required package: SummarizedExperiment\n",
"Loading required package: Biobase\n",
"Welcome to Bioconductor\n",
"\n",
" Vignettes contain introductory material; view with\n",
" 'browseVignettes()'. To cite Bioconductor, see\n",
" 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
"\n",
"Loading required package: DelayedArray\n",
"Loading required package: matrixStats\n",
"\n",
"Attaching package: ‘matrixStats’\n",
"\n",
"The following objects are masked from ‘package:Biobase’:\n",
"\n",
" anyMissing, rowMedians\n",
"\n",
"Loading required package: BiocParallel\n",
"\n",
"Attaching package: ‘DelayedArray’\n",
"\n",
"The following objects are masked from ‘package:matrixStats’:\n",
"\n",
" colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
"\n",
"The following objects are masked from ‘package:base’:\n",
"\n",
" aperm, apply\n",
"\n",
"Loading required package: Biostrings\n",
"Loading required package: XVector\n",
"\n",
"Attaching package: ‘Biostrings’\n",
"\n",
"The following object is masked from ‘package:DelayedArray’:\n",
"\n",
" type\n",
"\n",
"The following object is masked from ‘package:base’:\n",
"\n",
" strsplit\n",
"\n",
"Loading required package: Rsamtools\n",
"Warning message:\n",
"“replacing previous import ‘DT::dataTableOutput’ by ‘shiny::dataTableOutput’ when loading ‘SCRAT’”Warning message:\n",
"“replacing previous import ‘DT::renderDataTable’ by ‘shiny::renderDataTable’ when loading ‘SCRAT’”Warning message:\n",
"“replacing previous import ‘mclust::em’ by ‘shiny::em’ when loading ‘SCRAT’”"
]
}
],
"source": [
"library(devtools)\n",
"library(GenomicAlignments)\n",
"library(Rsamtools)\n",
"library(SCRATdatahg19)\n",
"library(SCRAT)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Obtain Feature Matrix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"start_time = Sys.time()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"metadata <- read.table('../../input/metadata.tsv',\n",
" header = TRUE,\n",
" stringsAsFactors=FALSE,quote=\"\",row.names=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"SCRATsummary <- function (dir = \"\", genome, bamfile = NULL, singlepair = \"automated\", \n",
" removeblacklist = T, log2transform = T, adjustlen = T, featurelist = c(\"GENE\", \n",
" \"ENCL\", \"MOTIF_TRANSFAC\", \"MOTIF_JASPAR\", \"GSEA\"), customfeature = NULL, \n",
" Genestarttype = \"TSSup\", Geneendtype = \"TSSdown\", Genestartbp = 3000, \n",
" Geneendbp = 1000, ENCLclunum = 2000, Motifflank = 100, GSEAterm = \"c5.bp\", \n",
" GSEAstarttype = \"TSSup\", GSEAendtype = \"TSSdown\", GSEAstartbp = 3000, \n",
" GSEAendbp = 1000) \n",
"{\n",
" if (is.null(bamfile)) {\n",
" bamfile <- list.files(dir, pattern = \".bam$\")\n",
" }\n",
" datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n",
" genome))\n",
" bamdata <- list()\n",
" for (i in bamfile) {\n",
" filepath <- file.path(dir, i)\n",
" if (singlepair == \"automated\") {\n",
" bamfile <- BamFile(filepath)\n",
" tmpsingle <- readGAlignments(bamfile)\n",
" tmppair <- readGAlignmentPairs(bamfile)\n",
" pairendtf <- testPairedEndBam(bamfile)\n",
" if (pairendtf) {\n",
" tmp <- tmppair\n",
" startpos <- pmin(start(first(tmp)), start(last(tmp)))\n",
" endpos <- pmax(end(first(tmp)), end(last(tmp)))\n",
" id <- which(!is.na(as.character(seqnames(tmp))))\n",
" tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n",
" }\n",
" else {\n",
" tmp <- GRanges(tmpsingle)\n",
" }\n",
" }\n",
" else if (singlepair == \"single\") {\n",
" tmp <- GRanges(readGAlignments(filepath))\n",
" }\n",
" else if (singlepair == \"pair\") {\n",
" tmp <- readGAlignmentPairs(filepath)\n",
" startpos <- pmin(start(first(tmp)), start(last(tmp)))\n",
" endpos <- pmax(end(first(tmp)), end(last(tmp)))\n",
" id <- which(!is.na(as.character(seqnames(tmp))))\n",
" tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))\n",
" }\n",
" if (removeblacklist) {\n",
" load(paste0(datapath, \"/gr/blacklist.rda\"))\n",
" tmp <- tmp[-as.matrix(findOverlaps(tmp, gr))[, 1], \n",
" ]\n",
" }\n",
" bamdata[[i]] <- tmp\n",
" }\n",
" bamsummary <- sapply(bamdata, length)\n",
" allres <- NULL\n",
" datapath <- system.file(\"extdata\", package = paste0(\"SCRATdata\", \n",
" genome))\n",
" if (\"GENE\" %in% featurelist) {\n",
" print(\"Processing GENE features\")\n",
" load(paste0(datapath, \"/gr/generegion.rda\"))\n",
" if (Genestarttype == \"TSSup\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \"+\", \n",
" start(gr) - as.numeric(Genestartbp), end(gr) + \n",
" as.numeric(Genestartbp))\n",
" }\n",
" else if (Genestarttype == \"TSSdown\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \"+\", \n",
" start(gr) + as.numeric(Genestartbp), end(gr) - \n",
" as.numeric(Genestartbp))\n",
" }\n",
" else if (Genestarttype == \"TESup\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \"+\", \n",
" end(gr) - as.numeric(Genestartbp), start(gr) + \n",
" as.numeric(Genestartbp))\n",
" }\n",
" else if (Genestarttype == \"TESdown\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \"+\", \n",
" end(gr) + as.numeric(Genestartbp), start(gr) - \n",
" as.numeric(Genestartbp))\n",
" }\n",
" if (Geneendtype == \"TSSup\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \"+\", \n",
" start(gr) - as.numeric(Geneendbp), end(gr) + \n",
" as.numeric(Geneendbp))\n",
" }\n",
" else if (Geneendtype == \"TSSdown\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \"+\", \n",
" start(gr) + as.numeric(Geneendbp), end(gr) - \n",
" as.numeric(Geneendbp))\n",
" }\n",
" else if (Geneendtype == \"TESup\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \"+\", \n",
" end(gr) - as.numeric(Geneendbp), start(gr) + \n",
" as.numeric(Geneendbp))\n",
" }\n",
" else if (Geneendtype == \"TESdown\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \"+\", \n",
" end(gr) + as.numeric(Geneendbp), start(gr) - \n",
" as.numeric(Geneendbp))\n",
" }\n",
" ngr <- names(gr)\n",
" gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n",
" grend), end = pmax(grstart, grend)))\n",
" names(gr) <- ngr\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- end(gr) - start(gr) + 1\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" if (\"ENCL\" %in% featurelist) {\n",
" print(\"Processing ENCL features\")\n",
" load(paste0(datapath, \"/gr/ENCL\", ENCLclunum, \".rda\"))\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n",
" 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" if (\"MOTIF_TRANSFAC\" %in% featurelist) {\n",
" print(\"Processing MOTIF_TRANSFAC features\")\n",
" load(paste0(datapath, \"/gr/transfac1.rda\"))\n",
" gr <- flank(gr, as.numeric(Motifflank), both = T)\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n",
" 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" load(paste0(datapath, \"/gr/transfac2.rda\"))\n",
" gr <- flank(gr, as.numeric(Motifflank), both = T)\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n",
" 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" if (genome %in% c(\"hg19\", \"hg38\")) {\n",
" load(paste0(datapath, \"/gr/transfac3.rda\"))\n",
" gr <- flank(gr, as.numeric(Motifflank), both = T)\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - \n",
" start(i) + 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" }\n",
" if (\"MOTIF_JASPAR\" %in% featurelist) {\n",
" print(\"Processing MOTIF_JASPAR features\")\n",
" load(paste0(datapath, \"/gr/jaspar1.rda\"))\n",
" gr <- flank(gr, as.numeric(Motifflank), both = T)\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n",
" 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" load(paste0(datapath, \"/gr/jaspar2.rda\"))\n",
" gr <- flank(gr, as.numeric(Motifflank), both = T)\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - start(i) + \n",
" 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" if (\"GSEA\" %in% featurelist) {\n",
" print(\"Processing GSEA features\")\n",
" for (i in GSEAterm) {\n",
" load(paste0(datapath, \"/gr/GSEA\", i, \".rda\"))\n",
" allgr <- gr\n",
" for (sgrn in names(allgr)) {\n",
" gr <- allgr[[sgrn]]\n",
" if (GSEAstarttype == \"TSSup\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \n",
" \"+\", start(gr) - as.numeric(GSEAstartbp), \n",
" end(gr) + as.numeric(GSEAstartbp))\n",
" }\n",
" else if (GSEAstarttype == \"TSSdown\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \n",
" \"+\", start(gr) + as.numeric(GSEAstartbp), \n",
" end(gr) - as.numeric(GSEAstartbp))\n",
" }\n",
" else if (GSEAstarttype == \"TESup\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \n",
" \"+\", end(gr) - as.numeric(GSEAstartbp), start(gr) + \n",
" as.numeric(GSEAstartbp))\n",
" }\n",
" else if (GSEAstarttype == \"TESdown\") {\n",
" grstart <- ifelse(as.character(strand(gr)) == \n",
" \"+\", end(gr) + as.numeric(GSEAstartbp), start(gr) - \n",
" as.numeric(GSEAstartbp))\n",
" }\n",
" if (GSEAendtype == \"TSSup\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \n",
" \"+\", start(gr) - as.numeric(GSEAendbp), end(gr) + \n",
" as.numeric(GSEAendbp))\n",
" }\n",
" else if (GSEAendtype == \"TSSdown\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \n",
" \"+\", start(gr) + as.numeric(GSEAendbp), end(gr) - \n",
" as.numeric(GSEAendbp))\n",
" }\n",
" else if (GSEAendtype == \"TESup\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \n",
" \"+\", end(gr) - as.numeric(GSEAendbp), start(gr) + \n",
" as.numeric(GSEAendbp))\n",
" }\n",
" else if (GSEAendtype == \"TESdown\") {\n",
" grend <- ifelse(as.character(strand(gr)) == \n",
" \"+\", end(gr) + as.numeric(GSEAendbp), start(gr) - \n",
" as.numeric(GSEAendbp))\n",
" }\n",
" ngr <- names(gr)\n",
" gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart, \n",
" grend), end = pmax(grstart, grend)))\n",
" names(gr) <- ngr\n",
" allgr[[sgrn]] <- gr\n",
" }\n",
" gr <- allgr\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- sapply(gr, function(i) sum(end(i) - \n",
" start(i) + 1))\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" }\n",
" if (\"Custom\" %in% featurelist) {\n",
" print(\"Processing custom features\")\n",
" gr <- read.table(customfeature, as.is = T, sep = \"\\t\")\n",
" gr <- GRanges(seqnames = gr[, 1], IRanges(start = gr[, \n",
" 2], end = gr[, 3]))\n",
" tmp <- sapply(bamdata, function(i) countOverlaps(gr, \n",
" i))\n",
" tmp <- sweep(tmp, 2, bamsummary, \"/\") * 10000\n",
" if (log2transform) {\n",
" tmp <- log2(tmp + 1)\n",
" }\n",
" if (adjustlen) {\n",
" grrange <- end(gr) - start(gr) + 1\n",
" tmp <- sweep(tmp, 1, grrange, \"/\") * 1e+06\n",
" }\n",
" tmp <- tmp[rowSums(tmp) > 0, , drop = F]\n",
" allres <- rbind(allres, tmp)\n",
" }\n",
" allres\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_out <- SCRATsummary(dir = \"../../input/sc-bams_nodup/\", \n",
" genome = \"mm9\",\n",
" featurelist=\"MOTIF_JASPAR\",\n",
" log2transform = FALSE, adjustlen = FALSE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"end_time <- Sys.time()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Time difference of 1.188708 days"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"end_time - start_time"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\t- 519
\n",
"\t- 12178
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 519\n",
"\\item 12178\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 519\n",
"2. 12178\n",
"\n",
"\n"
],
"text/plain": [
"[1] 519 12178"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" | BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam | BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam | BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam |
\n",
"\n",
"\tMOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |
\n",
"\tMOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |
\n",
"\tMOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |
\n",
"\tMOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |
\n",
"\tMOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & BoneMarrow\\_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam & BoneMarrow\\_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam & BoneMarrow\\_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam & BoneMarrow\\_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam & BoneMarrow\\_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam\\\\\n",
"\\hline\n",
"\tMOTIF:MA0002.2:RUNX1 & 779.67587 & 786.52359 & 679.96058 & 584.14298 & 832.58595\\\\\n",
"\tMOTIF:MA0003.3:TFAP2A & 760.69499 & 739.16432 & 593.73370 & 680.10076 & 1234.67862\\\\\n",
"\tMOTIF:MA0004.1:Arnt & 508.10337 & 528.14579 & 408.81688 & 394.62636 & 689.08819\\\\\n",
"\tMOTIF:MA0006.1:Ahr::Arnt & 727.11345 & 816.49781 & 618.65979 & 638.11923 & 1053.81166\\\\\n",
"\tMOTIF:MA0007.3:Ar & 54.02249 & 52.15515 & 48.25807 & 56.37519 & 62.78027\\\\\n",
"\\end{tabular}\n"
],
"text/markdown": [
"\n",
"| | BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam | BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam | BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam | BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam |\n",
"|---|---|---|---|---|---|\n",
"| MOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |\n",
"| MOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |\n",
"| MOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |\n",
"| MOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |\n",
"| MOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |\n",
"\n"
],
"text/plain": [
" BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam\n",
"MOTIF:MA0002.2:RUNX1 779.67587 \n",
"MOTIF:MA0003.3:TFAP2A 760.69499 \n",
"MOTIF:MA0004.1:Arnt 508.10337 \n",
"MOTIF:MA0006.1:Ahr::Arnt 727.11345 \n",
"MOTIF:MA0007.3:Ar 54.02249 \n",
" BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam\n",
"MOTIF:MA0002.2:RUNX1 786.52359 \n",
"MOTIF:MA0003.3:TFAP2A 739.16432 \n",
"MOTIF:MA0004.1:Arnt 528.14579 \n",
"MOTIF:MA0006.1:Ahr::Arnt 816.49781 \n",
"MOTIF:MA0007.3:Ar 52.15515 \n",
" BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam\n",
"MOTIF:MA0002.2:RUNX1 679.96058 \n",
"MOTIF:MA0003.3:TFAP2A 593.73370 \n",
"MOTIF:MA0004.1:Arnt 408.81688 \n",
"MOTIF:MA0006.1:Ahr::Arnt 618.65979 \n",
"MOTIF:MA0007.3:Ar 48.25807 \n",
" BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam\n",
"MOTIF:MA0002.2:RUNX1 584.14298 \n",
"MOTIF:MA0003.3:TFAP2A 680.10076 \n",
"MOTIF:MA0004.1:Arnt 394.62636 \n",
"MOTIF:MA0006.1:Ahr::Arnt 638.11923 \n",
"MOTIF:MA0007.3:Ar 56.37519 \n",
" BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam\n",
"MOTIF:MA0002.2:RUNX1 832.58595 \n",
"MOTIF:MA0003.3:TFAP2A 1234.67862 \n",
"MOTIF:MA0004.1:Arnt 689.08819 \n",
"MOTIF:MA0006.1:Ahr::Arnt 1053.81166 \n",
"MOTIF:MA0007.3:Ar 62.78027 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dim(df_out)\n",
"df_out[1:5,1:5]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'
\n",
"\t- 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'
\n",
"\t- 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'
\n",
"\t- 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'
\n",
"\t- 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'
\n",
"\t- 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'\n",
"\\item 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'\n",
"\\item 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'\n",
"\\item 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'\n",
"\\item 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'\n",
"\\item 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'\n",
"2. 'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'\n",
"3. 'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'\n",
"4. 'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'\n",
"5. 'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'\n",
"6. 'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'\n",
"\n",
"\n"
],
"text/plain": [
"[1] \"AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT\"\n",
"[2] \"AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC\"\n",
"[3] \"AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC\"\n",
"[4] \"AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC\"\n",
"[5] \"AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\"\n",
"[6] \"AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT\""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"head(sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 519
\n",
"\t- 12178
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 519\n",
"\\item 12178\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 519\n",
"2. 12178\n",
"\n",
"\n"
],
"text/plain": [
"[1] 519 12178"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" | AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC |
\n",
"\n",
"\tMOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |
\n",
"\tMOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |
\n",
"\tMOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |
\n",
"\tMOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |
\n",
"\tMOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT & AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC & AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC & AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC & AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\\\\\n",
"\\hline\n",
"\tMOTIF:MA0002.2:RUNX1 & 779.67587 & 786.52359 & 679.96058 & 584.14298 & 832.58595\\\\\n",
"\tMOTIF:MA0003.3:TFAP2A & 760.69499 & 739.16432 & 593.73370 & 680.10076 & 1234.67862\\\\\n",
"\tMOTIF:MA0004.1:Arnt & 508.10337 & 528.14579 & 408.81688 & 394.62636 & 689.08819\\\\\n",
"\tMOTIF:MA0006.1:Ahr::Arnt & 727.11345 & 816.49781 & 618.65979 & 638.11923 & 1053.81166\\\\\n",
"\tMOTIF:MA0007.3:Ar & 54.02249 & 52.15515 & 48.25807 & 56.37519 & 62.78027\\\\\n",
"\\end{tabular}\n"
],
"text/markdown": [
"\n",
"| | AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC |\n",
"|---|---|---|---|---|---|\n",
"| MOTIF:MA0002.2:RUNX1 | 779.67587 | 786.52359 | 679.96058 | 584.14298 | 832.58595 |\n",
"| MOTIF:MA0003.3:TFAP2A | 760.69499 | 739.16432 | 593.73370 | 680.10076 | 1234.67862 |\n",
"| MOTIF:MA0004.1:Arnt | 508.10337 | 528.14579 | 408.81688 | 394.62636 | 689.08819 |\n",
"| MOTIF:MA0006.1:Ahr::Arnt | 727.11345 | 816.49781 | 618.65979 | 638.11923 | 1053.81166 |\n",
"| MOTIF:MA0007.3:Ar | 54.02249 | 52.15515 | 48.25807 | 56.37519 | 62.78027 |\n",
"\n"
],
"text/plain": [
" AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT\n",
"MOTIF:MA0002.2:RUNX1 779.67587 \n",
"MOTIF:MA0003.3:TFAP2A 760.69499 \n",
"MOTIF:MA0004.1:Arnt 508.10337 \n",
"MOTIF:MA0006.1:Ahr::Arnt 727.11345 \n",
"MOTIF:MA0007.3:Ar 54.02249 \n",
" AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC\n",
"MOTIF:MA0002.2:RUNX1 786.52359 \n",
"MOTIF:MA0003.3:TFAP2A 739.16432 \n",
"MOTIF:MA0004.1:Arnt 528.14579 \n",
"MOTIF:MA0006.1:Ahr::Arnt 816.49781 \n",
"MOTIF:MA0007.3:Ar 52.15515 \n",
" AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC\n",
"MOTIF:MA0002.2:RUNX1 679.96058 \n",
"MOTIF:MA0003.3:TFAP2A 593.73370 \n",
"MOTIF:MA0004.1:Arnt 408.81688 \n",
"MOTIF:MA0006.1:Ahr::Arnt 618.65979 \n",
"MOTIF:MA0007.3:Ar 48.25807 \n",
" AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC\n",
"MOTIF:MA0002.2:RUNX1 584.14298 \n",
"MOTIF:MA0003.3:TFAP2A 680.10076 \n",
"MOTIF:MA0004.1:Arnt 394.62636 \n",
"MOTIF:MA0006.1:Ahr::Arnt 638.11923 \n",
"MOTIF:MA0007.3:Ar 56.37519 \n",
" AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC\n",
"MOTIF:MA0002.2:RUNX1 832.58595 \n",
"MOTIF:MA0003.3:TFAP2A 1234.67862 \n",
"MOTIF:MA0004.1:Arnt 689.08819 \n",
"MOTIF:MA0006.1:Ahr::Arnt 1053.81166 \n",
"MOTIF:MA0007.3:Ar 62.78027 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"colnames(df_out) = sapply(strsplit(colnames(df_out), \"\\\\.\"),'[',2)\n",
"dim(df_out)\n",
"df_out[1:5,1:5]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 519
\n",
"\t- 12178
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 519\n",
"\\item 12178\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 519\n",
"2. 12178\n",
"\n",
"\n"
],
"text/plain": [
"[1] 519 12178"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"if(! all(colnames(df_out) == rownames(metadata))){\n",
" df_out = df_out[,rownames(metadata)]\n",
" dim(df_out)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"saveRDS(df_out, file = '../../output/feature_matrices/FM_SCRAT_cusanovich2018subset_motifs.rds')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"R version 3.5.1 (2018-07-02)\n",
"Platform: x86_64-conda_cos6-linux-gnu (64-bit)\n",
"Running under: CentOS Linux 7 (Core)\n",
"\n",
"Matrix products: default\n",
"BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_SCRAT/lib/R/lib/libRblas.so\n",
"\n",
"locale:\n",
" [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C \n",
" [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 \n",
" [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 \n",
" [7] LC_PAPER=en_US.UTF-8 LC_NAME=C \n",
" [9] LC_ADDRESS=C LC_TELEPHONE=C \n",
"[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C \n",
"\n",
"attached base packages:\n",
"[1] stats4 parallel stats graphics grDevices utils datasets \n",
"[8] methods base \n",
"\n",
"other attached packages:\n",
" [1] SCRAT_0.99.0 SCRATdatahg19_0.99.1 \n",
" [3] GenomicAlignments_1.18.1 Rsamtools_1.34.0 \n",
" [5] Biostrings_2.50.2 XVector_0.22.0 \n",
" [7] SummarizedExperiment_1.12.0 DelayedArray_0.8.0 \n",
" [9] BiocParallel_1.16.6 matrixStats_0.54.0 \n",
"[11] Biobase_2.42.0 GenomicRanges_1.34.0 \n",
"[13] GenomeInfoDb_1.18.1 IRanges_2.16.0 \n",
"[15] S4Vectors_0.20.1 BiocGenerics_0.28.0 \n",
"[17] usethis_1.5.0 devtools_2.0.2 \n",
"\n",
"loaded via a namespace (and not attached):\n",
" [1] tsne_0.1-3 bitops_1.0-6 fs_1.2.7 \n",
" [4] RColorBrewer_1.1-2 rprojroot_1.3-2 repr_0.19.2 \n",
" [7] tools_3.5.1 backports_1.1.4 R6_2.4.0 \n",
"[10] DT_0.5 KernSmooth_2.23-15 lazyeval_0.2.2 \n",
"[13] colorspace_1.4-1 withr_2.1.2 tidyselect_0.2.5 \n",
"[16] prettyunits_1.0.2 processx_3.3.0 compiler_3.5.1 \n",
"[19] cli_1.1.0 desc_1.2.0 caTools_1.17.1.2 \n",
"[22] scales_1.0.0 callr_3.2.0 pbdZMQ_0.3-3 \n",
"[25] stringr_1.4.0 digest_0.6.18 shinyBS_0.61 \n",
"[28] scatterD3_0.9 dbscan_1.1-3 base64enc_0.1-3 \n",
"[31] pkgconfig_2.0.2 htmltools_0.3.6 sessioninfo_1.1.1 \n",
"[34] htmlwidgets_1.3 rlang_0.3.4 shiny_1.3.2 \n",
"[37] jsonlite_1.6 mclust_5.4.3 gtools_3.8.1 \n",
"[40] dplyr_0.8.0.1 RCurl_1.95-4.11 magrittr_1.5 \n",
"[43] GenomeInfoDbData_1.2.0 Matrix_1.2-17 Rcpp_1.0.1 \n",
"[46] IRkernel_0.8.15 munsell_0.5.0 stringi_1.4.3 \n",
"[49] zlibbioc_1.28.0 pkgbuild_1.0.3 gplots_3.0.1.1 \n",
"[52] plyr_1.8.4 grid_3.5.1 gdata_2.18.0 \n",
"[55] promises_1.0.1 crayon_1.3.4 lattice_0.20-38 \n",
"[58] IRdisplay_0.7.0 ps_1.3.0 pillar_1.3.1 \n",
"[61] uuid_0.1-2 reshape2_1.4.3 pkgload_1.0.2 \n",
"[64] glue_1.3.1 evaluate_0.13 remotes_2.0.4 \n",
"[67] httpuv_1.5.1 gtable_0.3.0 purrr_0.3.2 \n",
"[70] assertthat_0.2.1 ggplot2_3.1.1 mime_0.6 \n",
"[73] xtable_1.8-4 later_0.8.0 tibble_2.1.1 \n",
"[76] pheatmap_1.0.12 memoise_1.1.0 ellipse_0.4.1 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sessionInfo()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"save.image(file = 'SCRAT_cusanovich2018subset.RData')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "R [conda env:ATACseq_SCRAT]",
"language": "R",
"name": "conda-env-ATACseq_SCRAT-r"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}