---
title: ""
output: 
  html_document: 
    keep_md: yes
    number_sections: yes
    toc: yes
---

```{r setup, echo = FALSE}
knitr::opts_chunk$set(cache  = TRUE)
knitr::opts_knit$set(verbose = TRUE)
# knitr::opts_chunk$set(dev=c('svg', 'png'))
options(width = 100)
ggplot2::theme_set(ggplot2::theme_bw())
```

Metadata that varies from run to run:

```{r library_information_1}
LIBRARY      <- ""  # For example, the sequencing run ID.
ZENBU_COLLAB <- ""
WORKFLOW     <- ""  # For example "OP-WORKFLOW-CAGEscan-short-reads-v2.0"
MOIRAI_STAMP <- ""  # For example "20150803121351"
MOIRAI_PROJ  <- ""  # For example "Charles"
MOIRAI_USER  <- ""  # For example "nano-fluidigm"
ASSEMBLY     <- ""  # For example "hg38"
BS_GENOME    <- ""  # For example "BSgenome.Hsapiens.UCSC.hg38"

# # In case of a pair of Fluidigm C1 CAGE runs, uncomment the lines below and
# # fill in the missing information.  Otherwise, delete them or leave as is.
# RunA  <- ""
# RunB  <- ""
# ctrls <- list( RunA=list(posi="", nega="")
#              , RunB=list(posi="", nega=""))
```

Metadata inferred from the above:

```{r library_information_2}
BASEDIR      <- "/osc-fs_home/scratch/moirai"
MOIRAI_RUN   <- paste(LIBRARY, WORKFLOW, MOIRAI_STAMP, sep=".")
MOIRAI_BASE  <- file.path(BASEDIR, MOIRAI_USER, "project", MOIRAI_PROJ, MOIRAI_RUN)
MOIRAI_FRAGS <- file.path(MOIRAI_BASE, "CAGEscan_fragments")
MOIRAI_BAM   <- file.path(MOIRAI_BASE, "genome_mapped")
MOIRAI_URL   <- paste0("http://moirai.gsc.riken.jp/", MOIRAI_BASE, "/", MOIRAI_RUN, ".html")
```

`r LIBRARY`
==============

 - Sequenced on MiSeq (`r LIBRARY`)

Jump directly to the [Analysis](#analysis) section if you are not interested
in the details of the data processing.

Data load and QC in R
=====================

```{r load_R_libraries, message = FALSE}
library("AnnotationHub")
library(BS_GENOME, character.only = TRUE)
library("CAGEr")
library("data.table")
library("ggplot2")
library("magrittr")
library("oscR")
library("plyr")
library("smallCAGEqc")
stopifnot( packageVersion("oscR")        >= "0.2.0"
         , packageVersion("smallCAGEqc") >= "1" )
library("SummarizedExperiment")
library("reshape")
library("vegan")
```

MOIRAI metadata
---------------

```{r load_libs_from_moirai}
libs <- loadMoiraiStats(processed_data = MOIRAI_BASE)
# Pipe (%>%) to llPostProcess("nano-fluidigm") or
# llPostProcess("C1 96") if appropriate; see help(llPostProcess).
libs %>% summary
```

Intranet link: [`r MOIRAI_RUN`](`r MOIRAI_URL`)

Transcript counts (properly paired)
-----------------------------------

```{r load-ctss-from-bed-files}
pathsToInputFiles <- list.files(MOIRAI_FRAGS, "*.bed", full.names = TRUE) %>%
  Filter(f = function(file) file.size(file) > 0)
samplenames <- basename(pathsToInputFiles) %>% sub(".bed", "", .)

myCAGEset <- new( "CAGEset"
                , genomeName     = BS_GENOME
                , inputFiles     = pathsToInputFiles
                , inputFilesType = "bed"
                , sampleLabels   = samplenames)

getCTSS(myCAGEset)
normalizeTagCount(myCAGEset, method = "simpleTpm")
```

Raw reads per molecule (BED12 data)
-----------------------------------

```{r load_fragments}
bed <- loadBED12(pathsToInputFiles)
levels(bed$library) <- samplenames
```
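As an optional sanity check (a sketch, not part of the original workflow), the
number of molecules and raw reads per library can be tabulated directly from
`bed`, assuming that `loadBED12()` returns one row per CAGEscan fragment with
its read count in the `score` column, as the hanabi plots below also assume.

```
# Sketch: molecules (BED12 lines) and raw reads per library.
# Assumes the score column holds the read count of each molecule.
data.frame( molecules = tapply(bed$score, bed$library, length)
          , reads     = tapply(bed$score, bed$library, sum)) %>% summary
```

The block is fenced as plain text so that it is not executed when knitting;
paste it in an interactive session if needed.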
CAGEr functions
---------------

Here are functions for making CTSS tables, which I hope can be merged into
CAGEr (see https://github.com/Bioconductor-mirror/CAGEr/pull/2).

```{r functions-for-making-CTSS-tables}
CTSScoordinatesGR <- function (object) {
  ctssCoord <- object@CTSScoordinates
  ctssCoord <- GRanges( ctssCoord$chr
                      , IRanges(ctssCoord$pos, ctssCoord$pos)
                      , ctssCoord$strand)
  genome(ctssCoord) <- object@genomeName
  ctssCoord
}

CTSStagCountSE <- function (object) {
  colData <- data.frame( row.names   = object@sampleLabels
                       , samplename  = object@sampleLabels
                       , samplecolor = names(object@sampleLabels))
  SummarizedExperiment( assays    = as.matrix(object@tagCountMatrix)
                      , rowRanges = CTSScoordinatesGR(object)
                      , colData   = colData)
}
```

```{r make-ctss-data-frame}
l1 <- CTSStagCountSE(myCAGEset)
colnames(l1) <- samplenames
colData(l1) <- libs %>% DataFrame
l1$counts <- colSums(assay(l1))
l1$l1     <- colSums(assay(l1) > 0)
```

Annotation with GENCODE
-----------------------

### Gene symbols

```{r expression-per-gene}
ah <- AnnotationHub()
query(ah, c("Gencode", "gff", "human"))
ah["AH49556"]
gff <- ah[["AH49556"]]
gff

rowRanges(l1)$genes <- ranges2genes(rowRanges(l1), gff)
genes <- rowsum(assay(l1), rowRanges(l1)$genes) %>% data.frame
l1$genes       <- colSums(genes > 0)
l1$geneSymbols <- countSymbols(genes)
```

### Promoter, Exon, Intron, ...

```{r annotation}
rowRanges(l1)$annotation <- ranges2annot(rowRanges(l1), gff)
colData(l1) %<>% cbind(rowsum(assay(l1), rowRanges(l1)$annotation) %>% t %>% DataFrame)
```

Cell pictures
-------------

```{r load_fluo_data}
# read.fluo <- function(RUN) read.delim( paste0("../imageJ/", RUN, ".txt")
#                                      , row.names="cell_id"
#                                      , stringsAsFactors = FALSE)
# fluo <- rbind(read.fluo(RunA), read.fluo(RunB))
# libs <- cbind(libs, subset(fluo,,c("mean_ch2", "bg_mean_ch2", "mean_ch3", "bg_mean_ch3", "Error", "Comment")))
# libs$Error <- factor( libs$Error
#                     , levels=c("0-No Error", "1-No cell", "2-Debris", "3-OutOfFocus", "4-MultipleCells", "5-Control", "6-Dead"))
```

A hardcoded threshold of 2.5 is used to identify dead cells.  The histogram
below helps to check whether this value makes sense for this dataset.

```{r dead-cells, dev=c('svg', 'png'), fig.height=2.5}
# hist( libs$mean_ch3 - libs$bg_mean_ch3
#     , br = 100
#     , main = "Current threshold for identifying dead cells"
#     , xlab = "mean_ch3 - bg_mean_ch3" )
# deadThresh <- 2.5
# abline(v=deadThresh, col="red")
#
# libs[libs$mean_ch3 - libs$bg_mean_ch3 > deadThresh, "Error"] <- "6-Dead"
```

Controls
--------

Some samples with errors were replaced with the positive and negative controls.

```{r flag-controls}
l1$Error[l1$Well %in% ctrls[["RunA"]] & l1$Run == RunA] <- "5-Control"
l1$Error[l1$Well %in% ctrls[["RunB"]] & l1$Run == RunB] <- "5-Control"

l1$Comment[l1$Well == ctrls$RunA$posi & l1$Run == RunA] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunB$posi & l1$Run == RunB] <- "Positive control"
l1$Comment[l1$Well == ctrls$RunA$nega & l1$Run == RunA] <- "Negative control"
l1$Comment[l1$Well == ctrls$RunB$nega & l1$Run == RunB] <- "Negative control"
```

cDNA concentration.
-------------------

```{r cDNA_concentration, dev=c('svg', 'png'), fig.height=2.5, message=FALSE, warning=FALSE}
read.pg <- function(RUN)
  paste0("../cDNA_yields/", RUN, ".picogreen.xlsx") %>%
    fldgmPicoGreen("PN 100-6160") %>%
    extract(,"concentration")

l1$Concentration <- c(read.pg(RunA), read.pg(RunB))

fldgmConcentrationPlot(colData(l1))
```
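As a quick numeric complement to the plot above (a sketch, assuming the
two-run setup where `colData(l1)` carries a `Run` column), the concentrations
can also be summarised per run:

```
# Sketch: cDNA concentration summarised for each C1 run.
tapply(l1$Concentration, l1$Run, summary)
```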
Combined analysis of fluorescence and cDNA concentration.
---------------------------------------------------------

### Array heatmaps.

```{r define_fldgmArrayQCplot}
# fldgmArrayQCplot <- function(RUN) fldgmArrayQC(libs[libs$Run==RUN,], RUN)
```

```{r 'runA.arrayQC', dev=c('svg', 'png'), fig.height=2.5}
# fldgmArrayQCplot(RunA)
```

```{r 'runB.arrayQC', dev=c('svg', 'png'), fig.height=2.5}
# fldgmArrayQCplot(RunB)
```

### Live / dead stain and DNA concentration.

Both runs are plotted together, which explains why two groups may appear when
the cDNA yields differed strongly between the runs.

```{r live-dead, dev=c('svg', 'png')}
# with(libs, plotNameColor(Concentration, mean_ch2 - bg_mean_ch2, Error, Well))
# with(libs, plotNameColor(Concentration, mean_ch3 - bg_mean_ch3, Error, Well))
```

Richness
--------

```{r calculate-richness}
l1$r100l1 <- rarefy(t(assay(l1)), 100)
l1$r100l1[l1$counts < 100] <- NA
```

Analysis
========

```{r richness-concentration, dev=c('svg', 'png')}
# with(libs, plotNameColor(Concentration, r100l1, Error, Well))
```

```{r richness-outliers, dev=c('svg', 'png')}
# with(subset(libs, Error == "0-No Error"), plotNameColor(Concentration, r100l1, Run, Well))
```

QC barplots
-----------

```{r qc-barplots, dev=c('svg', 'png')}
plotAnnot(colData(l1), 'qc', LIBRARY, l1$Group)
```

Annotation
----------

```{r annotation-barplots, dev=c('svg', 'png')}
plotAnnot(colData(l1), 'counts', LIBRARY, l1$Group)
```

Correlation between runs
------------------------

```{r correlation-heatmap, dev=c('svg', 'png')}
NMF::aheatmap( cor(genes[-1, ])
             , annCol=list(Run=l1$Run))
```

```{r correlation-heatmap-noerrors, dev=c('svg', 'png')}
# Uncomment if you have an Error column.
#NMF::aheatmap( cor(genes[-1, libs$Error == "0-No Error"])
#             , annCol=list(Run=libs[libs$Error == "0-No Error", "Run"]))
```

Gene counts and TSS discovery
-----------------------------

### Gene count by error code.

```{r gene-count, dev=c('svg', 'png'), fig.height=2.5}
dotsize <- 50
ggplot(colData(l1) %>% data.frame, aes(x=Group, y=genes)) +
  stat_summary(fun.y=mean, fun.ymin=mean, fun.ymax=mean, geom="crossbar", color="gray") +
  geom_dotplot(aes(fill=Group), binaxis='y', binwidth=1, dotsize=dotsize, stackdir='center') +
  coord_flip() +
  facet_wrap(~Run)
```

### Gene count per transcript count.

```{r gene-counts-run-plot, dev=c('svg', 'png')}
with(colData(l1) %>% data.frame, plotNameColor(genes, counts, Run, Well))
```

### Gene counts per C1 run.

```{r gene-counts-run-boxplot, dev=c('svg', 'png')}
#boxplot( data = subset(libs, Error == "0-No Error")
#       , genes ~ Run
#       , ylab = "Number of genes detected (approx.)"
#       , main = "Comparison of gene detection by run.")
#t.test(data=subset(libs, Error == "0-No Error"), genes ~ Run)
```

Rarefaction (hanabi plot).
--------------------------

The purpose of these plots is to ease the comparison of libraries that do not
have exactly the same sequencing depth, and to evaluate whether extra
sequencing depth would be useful.

The data is rarefied with enough sampling points to give the curves a smooth
appearance.  If this takes too much time, the output can be stored in an
object saved on the hard drive.  This way, once the commands have been run
through knitr, the long computations can be skipped when the results are
needed again in an interactive session.  For example:

```
rar1 <- hanabi(assay(l1), from = 0)
saveRDS(rar1, "rar1.rds")
# And later...
rar1 <- readRDS("rar1.rds")
```
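A small helper along those lines can make the caching automatic.  This is only
a sketch (the name `hanabiCached` is not part of smallCAGEqc); it relies on
R's lazy evaluation so that the expensive expression is computed only when no
saved copy exists.

```
# Sketch of a caching helper: `expr` is evaluated only when the .rds file
# is missing; otherwise the saved object is returned.
hanabiCached <- function(expr, file) {
  if (file.exists(file)) return(readRDS(file))
  result <- expr
  saveRDS(result, file)
  result
}

# Hypothetical usage:
# rar1 <- hanabiCached(hanabi(assay(l1), from = 0), "rar1.rds")
```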
### Transcript discovery (with raw reads)

Would we detect more transcripts if we sequenced more raw reads?

```{r hanabi-UMI, dev=c('svg', 'png')}
hanabiPlot( tapply(bed$score, bed$library, hanabi, from = 0) %>%
              structure(class = "hanabi")
          , ylab  = 'number of molecules detected'
          , xlab  = 'number of properly mapped reads'
          , main  = paste("Transcript discovery for", LIBRARY)
          , GROUP = bed$library %>% levels %>% sub("_.*", "", .))
```

### Gene discovery

Would we detect more genes if we detected more transcripts?

```{r hanabi-gene, dev=c('svg', 'png')}
hanabiPlot( hanabi(genes, from = 0)
          , ylab  = 'number of genes detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("Gene discovery for", LIBRARY)
          , GROUP = colData(l1)$Group)
```

### TSS discovery

Would we detect more TSS if we detected more transcripts?

```{r hanabi-TSS, dev=c('svg', 'png')}
hanabiPlot( hanabi(assay(l1), from = 0)
          , ylab  = 'number of TSS detected'
          , xlab  = 'number of unique molecule counts'
          , main  = paste("TSS discovery for", LIBRARY)
          , GROUP = colData(l1)$Group)
```

### Comparison between the runs (only "No Error" cells)

```{r hanabi-runs, dev=c('svg', 'png')}
#hanabiPlot( rarg[libs$Error=="0-No Error",]
#          , sampleSizeCounts
#          , ylab  = 'number of genes detected'
#          , xlab  = 'number of unique molecule counts'
#          , main  = paste("Gene discovery for", LIBRARY)
#          , GROUP = libs[libs$Error=="0-No Error", "Run"])
#legend( 'topleft'
#      , legend = levels(libs[libs$Error=="0-No Error", "Run"])
#      , col    = 1:length(levels(libs[libs$Error=="0-No Error", "Run"]))
#      , pch    = 1)
```

Note that the horizontal axis is not the same in every plot:

 - For _TSS_ or _Gene discovery_, the horizontal axis is the number of unique
   molecules, that is, transcripts.  The purpose of these plots is to show how
   easily more TSS or genes would be detected if more transcripts were
   sequenced.
 - For _Transcript discovery_, the horizontal axis is the number of raw reads.
   The purpose of this plot is to show whether sequencing more raw reads would
   yield more transcripts.

Data load on Zenbu.
===================

FIXME: will do later

FIXME: make idempotent

Accessory functions
-------------------

Uploading to the Zenbu collaboration `r ZENBU_COLLAB`.

Ad-hoc wrapper to the shell command `zenbu_upload`:

```{r define_zenbuUpload}
zenbuUpload <- function ( ...
                        , URL     = "http://fantom.gsc.riken.jp/zenbu"
                        , verbose = FALSE
                        , echo    = FALSE
                        , stdout  = TRUE) {
  zenbu <- 'zenbu_upload'
  url   <- c('-url', URL)
  args  <- sapply(c(url, ...), shQuote)
  if (verbose == TRUE) print(paste(c(zenbu, args), collapse = ' '))
  if (echo == FALSE) {
    system2(zenbu, args, stdout = stdout)
  } else {
    system2('echo', c(zenbu, args), stdout = stdout)
  }
}
```

Helper functions:

```{r define_helperFunctions}
bedToSample <- function(BED)
  BED %>%
    sub("RunA", RunA, .) %>%
    sub("RunB", RunB, .) %>%
    sub(".bed", "", .)

# To produce a string of keywords that will uniquely identify a sample.
# TODO: move keywords to the top.
sampleDescription <- function(BED)
  paste( MOIRAI_FRAGS
       , bedToSample(BED)
       , "what kind of data"             # For instance "CAGEscan_fragments"
       , "a keyword for easy retrieval"  # For instance "KnitrUpload"
       # Add more keywords as needed.
       )

CSfragmentLocation <- function(BED) paste0(MOIRAI_FRAGS, "/", BED, ".bed")

CSuploadCommand <- function(BED)
  paste( CSfragmentLocation(BED)
       , bedToSample(BED)
       , sampleDescription(BED)
       , ""
       , sep = "\t")
```

```{r createUploadFile}
levels(bed$library) %>%
  CSuploadCommand %>%
  writeLines(paste0(LIBRARY, ".zUpload.tsv"))
```
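Before running the real upload below, the exact command line can be previewed
thanks to the `echo` argument of `zenbuUpload()`; a sketch using the same
flags as the upload chunk that follows:

```
# Dry run (sketch): print the zenbu_upload command instead of executing it.
zenbuUpload( "-assembly", ASSEMBLY
           , "-filelist", paste0(LIBRARY, ".zUpload.tsv")
           , "-collab_uuid", ZENBU_COLLAB
           , "-singletag_exp"
           , echo = TRUE)
```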
Upload the CAGEscan fragments, but only if they have not been uploaded yet.

```{r zenbuFileListUpload}
zenbuUpload( "-assembly", ASSEMBLY
           , "-filelist", paste0(LIBRARY, ".zUpload.tsv")
           , "-collab_uuid", ZENBU_COLLAB
           , "-singletag_exp")
```

Functions to add metadata tags:

```{r tag-in-zenbu}
zenbuTag <- function (filter, key, value, mode='add', ...) {
  args <- c('-mdedit', filter, mode, key, value)
  zenbuUpload (args, ...)
}

tagError <- function(BED)
  zenbuTag( sampleDescription(BED)
          , 'cellomics'
          , libs[bedToSample(BED), "Error"] %>% as.character)

tagMeta <- function(BED, TAG)
  zenbuTag( sampleDescription(BED)
          , TAG
          , libs[bedToSample(BED), TAG] %>% as.character)

tagComment <- function (Sample, comment, ...) {
  filter <- paste(MOIRAI_FRAGS, Sample, "C1_CAGEscan_fragments", "KnitrUpload")
  zenbuTag(filter, 'cellomics_comment', comment, ...)
}

#apply(libs[,c('Well', 'Error')],   1, function(X) tagError (X[1], X[2]))
#apply(libs[,c('Well', 'Comment')], 1, function(X) tagComment(X[1], X[2]))

# Keywords need to be stringent: "D12" matches BED12 !
```

### Zenbu uploads and tagging

```{r upload-and-tag}
samplesToUpload <- subset(libs, properpairs > 0, "samplename", drop = TRUE) %>%
  sub(RunA, "RunA", .) %>%
  sub(RunB, "RunB", .) %>%
  sub("$", ".bed", .)

sapply(samplesToUpload, tagError)
sapply(samplesToUpload, tagMeta, TAG = "Group")
sapply(samplesToUpload, tagMeta, TAG = "Run")
sapply(samplesToUpload, tagMeta, TAG = "row")
sapply(samplesToUpload, tagMeta, TAG = "column")
```
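The tagging commands can be previewed the same way: `zenbuTag()` forwards
extra arguments to `zenbuUpload()`, so `echo = TRUE` prints the command
instead of running it.  A sketch for the first sample, assuming the `Group`
column used above is present in `libs`:

```
# Dry run (sketch): show the command that would tag the first sample's Group.
zenbuTag( sampleDescription(samplesToUpload[1])
        , "Group"
        , libs[bedToSample(samplesToUpload[1]), "Group"] %>% as.character
        , echo = TRUE)
```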