# Build and Reload Package:  'Cmd + Shift + B'
# Check Package:             'Cmd + Shift + E'
# Test Package:              'Cmd + Shift + T'

#' @title TailSeq: Calculating Truncation, Extension, and Tail of RNA Molecules
#'
#' @description Current packages that handle sequence-based RNA or DNA
#'   analysis, namely Biostrings, are very effective in many applications;
#'   in our experience, however, they cannot capture the specific nuances of
#'   3' sequence analysis in the detail we had in mind. To address this
#'   issue, we developed an R package called TailSeq. Please refer to the
#'   package manual for details.
#'
#' @details Every read in `file_path` is scanned for `primerSeq` and
#'   `adaptorSeq`. A read is "normal" when each of the two occurs exactly
#'   once. For normal reads the flank is the text between the end of the
#'   primer and the start of the adaptor. The tail is everything in the flank
#'   from the first run of `tailThreshold` consecutive `tailNucleotide`
#'   onwards; `woTail` is the flank with that tail removed.
#'
#'   Side effects: a PDF with three pie charts and three CSV files are written
#'   next to the input file. Their names are the input path without its
#'   extension followed by `.pdf`, `processed_TailSeqv2.csv`,
#'   `SizeOfExt_processed_TailSeqv2.csv` and
#'   `SizeOfTail_processed_TailSeqv2.csv`. Summary counts are printed to the
#'   console.
#'
#' @param file_path Path to a text file read with `read.table()`; expected to
#'   hold one sequence per line.
#' @param primerSeq Primer sequence. It is used as a regular expression.
#' @param adaptorSeq Adaptor sequence. It is used as a regular expression.
#' @param distPrimerFromEoG Number subtracted from the length of the
#'   tail-free flank to obtain `sizeofExt` (presumably the distance of the
#'   primer from the end of the gene -- confirm against the package manual).
#' @param tailNucleotide Nucleotide (or pattern) that makes up the tail.
#' @param tailThreshold Number of consecutive `tailNucleotide` required
#'   before the rest of the flank is treated as tail.
#'
#' @return A data frame with one row per normal read and the columns
#'   `OriginalSequence`, `NumberOfPrimer`, `NumberOfAdaptor`, `NormalReads`,
#'   `end_of_primer`, `start_of_adaptor`, `flank`, `woTail`, `sizeOfTail` and
#'   `sizeofExt`.
#'
#' @export
TailSeq <- function(file_path, primerSeq, adaptorSeq, distPrimerFromEoG,
                    tailNucleotide, tailThreshold) {
  # Every output file shares this stem (input path minus its extension).
  out_stem <- tools::file_path_sans_ext(file_path)

  dat <- utils::read.table(file_path, header = FALSE, fill = TRUE)
  # NOTE(review): assumes a single column, i.e. no whitespace inside reads.
  colnames(dat) <- "OriginalSequence"
  dat$OriginalSequence <- as.character(dat$OriginalSequence)

  cat("Calculate Primer Incidence and Locations\n")
  dat$NumberOfPrimer <- tailseq_count_matches(dat$OriginalSequence, primerSeq)
  cat("Calculate Adaptor Incidence and Locations\n")
  dat$NumberOfAdaptor <- tailseq_count_matches(dat$OriginalSequence, adaptorSeq)
  dat$NormalReads <- dat$NumberOfPrimer == 1L & dat$NumberOfAdaptor == 1L

  tailseq_plot_summary(dat, paste0(out_stem, ".pdf"))

  cat(c("Number of Adaptor per Read: ", table(dat$NumberOfAdaptor), "\n"))
  cat(c("Number of Primer per Read: ", table(dat$NumberOfPrimer), "\n"))
  cat(c("Number of Normal Reads: ", table(dat$NormalReads), "\n"))

  dat_normal <- dat[dat$NormalReads, , drop = FALSE]
  cat("\n")
  cat(paste0(c("Number of Reads Excluded Due to Multiple Adaptor or Primer",
               nrow(dat) - nrow(dat_normal)), collapse = " : "))
  cat("\n")

  # Normal reads contain exactly one primer and one adaptor, so the first
  # match is the only match.
  reads <- dat_normal$OriginalSequence
  primer_hit <- regexpr(primerSeq, reads, perl = TRUE)
  dat_normal$end_of_primer <-
    as.integer(primer_hit) + attr(primer_hit, "match.length") - 1L
  dat_normal$start_of_adaptor <-
    as.integer(regexpr(adaptorSeq, reads, perl = TRUE))

  # substr() yields "" when the adaptor starts before the primer ends.
  dat_normal$flank <- substr(reads,
                             dat_normal$end_of_primer + 1L,
                             dat_normal$start_of_adaptor - 1L)

  # The trailing ".*" makes the match run to the end of the flank, so
  # everything from the first qualifying run onwards counts as tail.
  tailPattern <- paste0(c(rep(tailNucleotide, tailThreshold), ".*"),
                        collapse = "")
  dat_normal$woTail <- gsub(tailPattern, "", dat_normal$flank)
  dat_normal$sizeOfTail <- nchar(dat_normal$flank) - nchar(dat_normal$woTail)
  dat_normal$sizeofExt <- nchar(dat_normal$woTail) - distPrimerFromEoG

  utils::write.csv(dat_normal,
                   paste0(out_stem, "processed_TailSeqv2.csv"))
  utils::write.csv(table(dat_normal$sizeofExt),
                   paste0(out_stem, "SizeOfExt_processed_TailSeqv2.csv"))
  utils::write.csv(table(dat_normal$sizeOfTail),
                   paste0(out_stem, "SizeOfTail_processed_TailSeqv2.csv"))

  dat_normal
}

# Count the non-overlapping matches of `pattern` in each element of `x`.
# Elements without a match (and NA elements) count as 0.
tailseq_count_matches <- function(x, pattern) {
  hits <- gregexpr(pattern, x, perl = TRUE)
  # gregexpr() reports "no match" as -1, so only positive starts are hits.
  vapply(hits, function(m) sum(m > 0L, na.rm = TRUE), integer(1))
}

# Write the three summary pie charts for `dat` into the PDF at `pdf_path`.
tailseq_plot_summary <- function(dat, pdf_path) {
  grDevices::pdf(pdf_path, onefile = TRUE)
  device <- grDevices::dev.cur()
  # Close this device even when a plot call fails.
  on.exit(grDevices::dev.off(device), add = TRUE)

  graphics::pie(table(dat$NumberOfAdaptor),
                main = "Number of Adaptor per reads")
  graphics::pie(table(dat$NumberOfPrimer),
                main = "Number of Primer per read")
  # Default labels (FALSE / TRUE) identify both slices.
  graphics::pie(table(dat$NormalReads), main = "Number of Good Reads")

  invisible(pdf_path)
}