################################################################## ### R Script for Pattern Searching in DNA or Protein Sequences ### ################################################################## # Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside # Last update: Nov 22, 2016 # Utility: # Sequence import function # Reverse and complement function # Pattern searching function # Positional parsing # HTML and TEXT report of results # How to run the script: # source("https://bit.ly/37kIMdp") ################################## ## (1) Sequence Import Function ## ################################## seq_imp_fct <- function(fileloc) { my_fasta <- readLines(fileloc) # reads file line-wise into vector y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign y <- as.vector(y); y[y==-1] <- 0 index <- which(y==0) distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)]) distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry distance <- data.frame(distance, dist=distance[,2]-distance[,1]) seq_no <- 1:length(y[y==0]) index <- rep(seq_no, as.vector(distance[,3])) my_fasta <- data.frame(index, y, my_fasta) my_fasta[my_fasta[,2]==0,1] <- 0 seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F) seq <- as.vector(seq[2:length(seq)]) Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3]) ID <- gsub("^>| .*", "", as.character(Desc), perl=T) Desc <- gsub("^.*? ", "", as.character(Desc), perl=T) my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq) my_fasta } cat("\n# (1.1) Import sequences into R like this: \n\t myseq <- seq_imp_fct(\"http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/Samples/sampleDNA.txt\")\n") cat("\n# (1.2) Run this command to set all sequences to upper case since the function gregexpr() is case sensitive! \n\t myseq$seq <- gsub(\"([A-Z])\", \"\\\\U\\\\1\", as.character(myseq$seq), perl=T, ignore.case=T)\n") cat("\n# (1.3) To speed up the sequence loading process, save them as R object like this: \n\t save(myseq, file=\"myseq\")\n") cat("\n# (1.4) Next time reload the sequences with this command: \n\t load(file=\"myseq\")\n") ############################################ ## (2) Reverse and/or Complement Function ## ############################################ rev_comp_fct <- function(seq=mystr, rev=T, comp=T) { if(rev==T) { seq <- as.vector(unlist(strsplit(seq, split=""))) seq <- rev(seq) seq <- paste(seq, collapse="") } if(comp==T) { seq <- gsub("A", "1", seq, ignore.case = T) seq <- gsub("T", "2", seq, ignore.case = T) seq <- gsub("C", "3", seq, ignore.case = T) seq <- gsub("G", "4", seq, ignore.case = T) seq <- gsub("1", "T", seq, ignore.case = T) seq <- gsub("2", "A", seq, ignore.case = T) seq <- gsub("3", "G", seq, ignore.case = T) seq <- gsub("4", "C", seq, ignore.case = T) } seq } cat("\n# (2) The reverse and/or complement of a sequence can be obtained like this: \n\t rev_comp_fct(seq=\"TCGATCGTACGTTCAGCT\", rev=T, comp=T)\n") ################################### ## (3) Pattern Matching Function ## ################################### # This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'. # In addition to the pattern counts, the function provides the positions of the pattern matches. # Define pattern cat("\n# (3.1) Define segmented pattern in upper case like this:\n\t p1 <- \"GGTTCCA\"; p2 <- \"ACCAG\"; mind <- 10; maxd <- 25 \n\t\t # p1: first pattern segment; p2: second pattern segment; mind: min distance; maxd: max distance\n") # Define pattern function pattern_fct <- function(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) { # expects sequences in fourth column of 'sequence' data frame if(segmented==T & revcomp==F) { pattern <- paste(p1, ".{", mind, ",", maxd, "}", p2, sep="") cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n") } if(segmented==T & revcomp==T) { p1 <- rev_comp_fct(seq=p1, rev=T, comp=T) p2 <- rev_comp_fct(seq=p2, rev=T, comp=T) pattern <- paste(p2, ".{", mind, ",", maxd, "}", p1, sep="") cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n") } if(segmented==F) { cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n") } pos <- gregexpr(pattern, as.character(sequences$seq), perl=T) # 'gregexpr' returns list with pattern positions posS <- posS <- unlist(lapply(pos, function(x) paste(x, collapse=", "))) # retrieves positions where pattern matches in each sequence posS[posS==-1] <- 0 if(genPos==T) { posE <- unlist(lapply(pos, function(x) paste((as.vector(unlist(x)) + as.vector(unlist(attributes(x))) - 1), collapse=", "))) posE[posS==0] <- 0 hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence posDF <- data.frame(ID=sequences[,1], hitsv, posS, posE, Length=sequences$Length) posDF <- posDF[posDF[,2]>=1,] posDF <- data.frame( ID=rep(posDF[,1], posDF[,2]), start=unlist(strsplit(as.character(posDF$posS), ", ")), end=unlist(strsplit(as.character(posDF$posE), ", ")), count=rep(posDF[,2], posDF[,2]), seq_length=rep(posDF[,5], posDF[,2])) pos_frame <<- posDF cat("\n The object \"pos_frame\" has been generated.\n") } hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence sequences <- data.frame(sequences[,1:3], Position=as.vector(posS), Hits=hitsv, sequences$seq) mypattern <<- pattern cat("\n Number of sequences with at least one match:", sum(sequences$Hits>=1), "\n Total number of matches:", sum(sequences$Hits),"\n\n") sequences } cat("\n# (3.2) Run pattern search: \n\t Qmyseq <- pattern_fct(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) \n\t\t # The argument \"pattern\" allows to specify regular expression patterns directly when \"segmented=F\"\n") ####################### ## (4.1) HTML Report ## ####################### html_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) { export <- Qmyseq[Qmyseq$Hits>=1,] if(all!=T) { export <- export[1:nhits,] } export_count <- length(export[,1]) seqhtml <- gsub(paste("(", mypattern, ")", sep=""), "\\1", as.character(export$seq), perl=T, ignore.case=T) seqhtml <- paste(seqhtml, rep("
", length(seqhtml))) export <- data.frame(export[,-6], seq=seqhtml) export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2], sep=", "), seq=export[,6]) export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq) export <- as.vector(unlist(t(export))) export <- c( "", "Search Result", "", "
", 
			paste("Pattern used for search:", mypattern), 
			paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), 
			paste("Total number of matches:", sum(Qmyseq$Hits)),
			paste("Number of sequences exported to this page:", export_count, "

"), export, "

", "", "") write.table(export, file=paste(myfile, ".html", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (4.1) Export results in HTML format: \n\t html_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n") ######################## ## (4.2) FASTA Report ## ######################## fasta_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) { export <- Qmyseq[Qmyseq$Hits>=1,] if(all!=T) { export <- export[1:nhits,] } export_count <- length(export[,1]) export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2], sep=", "), seq=export[,6]) export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq) export <- as.vector(unlist(t(export))) export <- c( paste("Pattern used for search:", mypattern), paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), paste("Total number of matches:", sum(Qmyseq$Hits)), paste("Number of sequences exported to this page:", export_count, "\n"), export) write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (4.2) Export results in fasta format: \n\t fasta_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n") ################################### ## (5) Position Parsing Function ## ################################### # This step requires a position data frame of the below format, which is automatically generated by the pattern_fct() # function. The following example is for parsing ORFs from chromosomes. The ID numbers in the first colum # need to match the ones in myseq. Additional tracking columns can be appended at the end! Very long sequences like chromosomes # require a lot of memory for parsing (~5GB per 100 million bps). # ID start end count seq_length # 1 500 521 7852 30432563 # 2 500 521 4853 19705359 # 3 500 521 6048 23470805 # 4 500 521 4655 18585042 # 5 500 521 7072 26992728 # C 500 541 88 154478 # M 500 541 122 366924 # Define parse function parsePos <- function(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame) { parsel <- as.list(as.character(myseql[,2])) names(parsel) <- myseql[,1] parsel <- parsel[unique(as.vector(parseIDs))] myids <- names(parsel) parsel <- lapply(parsel, strsplit, "") parsel <- lapply(parsel, unlist) exceedl <- apply(coor, 1, function(x) parsel[[x[1]]][as.numeric(x[2]):as.numeric(x[3])]) exceed <- lapply(exceedl, function(x) sum(is.na(x))) if(sum(unlist(exceed))>0) { bad_coor <- which(as.vector(unlist(exceed))>0) cat("Warning: the coordinates of the following sequences are off, which caused NA insertions:\n", myids[bad_coor], "\n") } parsel <- as.list(apply(coor[,1:3], 1, function(x) paste(parsel[[as.vector(x[1])]][as.numeric(x[2]):as.numeric(x[3])],collapse=""))) names(parsel) <- paste(coor[,1], "Seg", coor[,2], coor[,3], sep="_") parsel } cat("\n# (5.1) Parse coordinates:\n\t seql <- parsePos(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame)\n") # Function to add up- or down-stream to position data frame getUpDown <- function(pos_frame=pos_frame, getUp=0, getDown=0) { pos_frame$start <- as.numeric(as.vector(pos_frame$start)) - getUp pos_frame$end <- as.numeric(as.vector(pos_frame$end)) + getDown pos_frame$start[pos_frame$start < 1] <- 1 pos_frame$end[pos_frame$end > pos_frame$seq_length] <- pos_frame$seq_length[pos_frame$end > pos_frame$seq_length] pos_frame } cat("\n# (5.2) Up- or down-stream regions can be retrieved by providing length info in getDown/getUp arguments: \n\t pos_frame <- getUpDown(pos_frame=pos_frame, getUp=0, getDown=0)\n") # Export sequence list to text file in fasta format seql2fasta <- function(seql=seql, myfile="myfile") { names(seql) <- paste(">", names(seql), sep="") export <- data.frame(IDs=names(seql), seq=unlist(seql)) export <- as.vector(unlist(t(export))) write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F) } cat("\n# (5.3) Export parse results in fasta format:\n\t seql2fasta(seql=seql, myfile=\"myfile\")\n")