##################################################################
### R Script for Pattern Searching in DNA or Protein Sequences ###
##################################################################
# Author: Thomas Girke (thomas.girke@ucr.edu), UC Riverside
# Last update: Nov 22, 2016
# Utility: 
#	Sequence import function
#	Reverse and complement function
#	Pattern searching function
#	Positional parsing
#	HTML and TEXT report of results

# How to run the script:
# 	source("https://bit.ly/37kIMdp")

##################################
## (1) Sequence Import Function ##
##################################
seq_imp_fct <- function(fileloc) {
	my_fasta <- readLines(fileloc) # reads file line-wise into vector 
	y <- regexpr("^[^>]", my_fasta, perl=T) # identifies all fields that do not start with a '>' sign 
	y <- as.vector(y);  y[y==-1] <- 0
	index <- which(y==0)
	distance <- data.frame(start=index[1:(length(index)-1)], end=index[2:length(index)])
	distance <- rbind(distance, c(distance[length(distance[,1]),2], length(y)+1)) # gets data for last entry 
	distance <- data.frame(distance, dist=distance[,2]-distance[,1])
	seq_no <- 1:length(y[y==0])
	index <- rep(seq_no, as.vector(distance[,3]))
	my_fasta <- data.frame(index, y, my_fasta)
	my_fasta[my_fasta[,2]==0,1] <- 0
	seq <- tapply(as.vector(my_fasta[,3]), factor(my_fasta[,1]), paste, collapse="", simplify=F)
	seq <- as.vector(seq[2:length(seq)])
	Desc <- as.vector(my_fasta[c(grep("^>", as.character(my_fasta[,3]), perl = TRUE)),3])
	ID <- gsub("^>| .*", "", as.character(Desc), perl=T)
	Desc <- gsub("^.*? ", "", as.character(Desc), perl=T)
	my_fasta <- data.frame(ID, Desc, Length=nchar(seq), seq)
	my_fasta
}	
cat("\n# (1.1) Import sequences into R like this: \n\t myseq <- seq_imp_fct(\"http://bioinfo.ucr.edu/~tgirke/Documents/R_BioCond/Samples/sampleDNA.txt\")\n")
cat("\n# (1.2) Run this command to set all sequences to upper case since the function gregexpr() is case sensitive! \n\t myseq$seq <- gsub(\"([A-Z])\", \"\\\\U\\\\1\", as.character(myseq$seq), perl=T, ignore.case=T)\n")
cat("\n# (1.3) To speed up the sequence loading process, save them as R object like this: \n\t save(myseq, file=\"myseq\")\n")   
cat("\n# (1.4) Next time reload the sequences with this command: \n\t load(file=\"myseq\")\n")

############################################
## (2) Reverse and/or Complement Function ##
############################################
rev_comp_fct <- function(seq=mystr, rev=T, comp=T) {
	if(rev==T) {
		seq <- as.vector(unlist(strsplit(seq, split="")))
		seq <- rev(seq)
		seq <- paste(seq, collapse="")
	}	
	if(comp==T) {
		seq <- gsub("A", "1", seq, ignore.case = T)
		seq <- gsub("T", "2", seq, ignore.case = T)
		seq <- gsub("C", "3", seq, ignore.case = T)
		seq <- gsub("G", "4", seq, ignore.case = T)		
		seq <- gsub("1", "T", seq, ignore.case = T)
		seq <- gsub("2", "A", seq, ignore.case = T)
		seq <- gsub("3", "G", seq, ignore.case = T)
		seq <- gsub("4", "C", seq, ignore.case = T)
	}
	seq
}
cat("\n# (2) The reverse and/or complement of a sequence can be obtained like this: \n\t rev_comp_fct(seq=\"TCGATCGTACGTTCAGCT\", rev=T, comp=T)\n")

###################################
## (3) Pattern Matching Function ##
###################################
# This function counts the matches of a user defined pattern (reg expr) in a sequence data frame generated by above 'seq_imp_fct'.
# In addition to the pattern counts, the function provides the positions of the pattern matches.

# Define pattern
cat("\n# (3.1) Define segmented pattern in upper case like this:\n\t p1 <- \"GGTTCCA\"; p2 <- \"ACCAG\"; mind <- 10; maxd <- 25 \n\t\t # p1: first pattern segment; p2: second pattern segment; mind: min distance; maxd: max distance\n")
			   
# Define pattern function
pattern_fct <- function(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) { # expects sequences in fourth column of 'sequence' data frame 
	if(segmented==T & revcomp==F) { 
		pattern <- paste(p1, ".{", mind, ",", maxd, "}", p2, sep="") 
		cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
	}
	if(segmented==T & revcomp==T) { 
		p1 <- rev_comp_fct(seq=p1, rev=T, comp=T)	
		p2 <- rev_comp_fct(seq=p2, rev=T, comp=T)	
		pattern <- paste(p2, ".{", mind, ",", maxd, "}", p1, sep="") 
		cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
	}
	if(segmented==F) { 
		cat("\n Your regular expression pattern looks like this: \n\t", pattern, "\n")
	}
	pos <- gregexpr(pattern, as.character(sequences$seq), perl=T) # 'gregexpr' returns list with pattern positions
	posS <- posS <- unlist(lapply(pos, function(x) paste(x, collapse=", "))) # retrieves positions where pattern matches in each sequence
	posS[posS==-1] <- 0
	if(genPos==T) {
		posE <- unlist(lapply(pos, function(x) paste((as.vector(unlist(x)) + as.vector(unlist(attributes(x))) - 1), collapse=", ")))
		posE[posS==0] <- 0
		hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
		posDF <- data.frame(ID=sequences[,1], hitsv, posS, posE, Length=sequences$Length)
		posDF <- posDF[posDF[,2]>=1,]
		posDF <- data.frame(
			ID=rep(posDF[,1], posDF[,2]),
			start=unlist(strsplit(as.character(posDF$posS), ", ")), 
			end=unlist(strsplit(as.character(posDF$posE), ", ")),
			count=rep(posDF[,2], posDF[,2]),
			seq_length=rep(posDF[,5], posDF[,2]))
		pos_frame <<- posDF
		cat("\n The object \"pos_frame\" has been generated.\n")
	}
	hitsv <- unlist(lapply(pos, function(x) if(x[1]==-1) { 0 } else { length(x) })) # counts the number of hits per sequence
	sequences <- data.frame(sequences[,1:3], Position=as.vector(posS), Hits=hitsv, sequences$seq)
	mypattern <<- pattern
	cat("\n Number of sequences with at least one match:", sum(sequences$Hits>=1), "\n Total number of matches:", sum(sequences$Hits),"\n\n")
	sequences
}
cat("\n# (3.2) Run pattern search: \n\t Qmyseq <- pattern_fct(pattern, sequences=myseq, segmented=T, revcomp=F, genPos=T) \n\t\t # The argument \"pattern\" allows to specify regular expression patterns directly when \"segmented=F\"\n")

#######################
## (4.1) HTML Report ##
#######################
html_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) {
	export <- Qmyseq[Qmyseq$Hits>=1,]
	if(all!=T) { export <- export[1:nhits,] }
	export_count <- length(export[,1])
	seqhtml <- gsub(paste("(", mypattern, ")", sep=""), "<font color=red>\\1</font>", as.character(export$seq), perl=T, ignore.case=T)
	seqhtml <- paste(seqhtml, rep("<br>", length(seqhtml)))
	export <- data.frame(export[,-6], seq=seqhtml)
	export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2],  sep=", "), seq=export[,6])
	export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq)
	export <- as.vector(unlist(t(export)))
	export <- c(
			"<html>", 
			"<head><title>Search Result</title></head>", 
			"<body>", 
			"<pre>", 
			paste("Pattern used for search:", mypattern), 
			paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), 
			paste("Total number of matches:", sum(Qmyseq$Hits)),
			paste("Number of sequences exported to this page:", export_count, "<br><p>"),
			export, 
			"</pre>", 
			"</body>", 
			"</html>")
	write.table(export, file=paste(myfile, ".html", sep=""), quote=F, row.names=F, col.names=F)
}
cat("\n# (4.1) Export results in HTML format: \n\t html_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n")

########################
## (4.2) FASTA Report ##
########################
fasta_report <- function(Qmyseq=Qmyseq, myfile="search", all=T, nhits) {
	export <- Qmyseq[Qmyseq$Hits>=1,]
	if(all!=T) { export <- export[1:nhits,] }
	export_count <- length(export[,1])
	export <- data.frame(Acc=paste(">", export[,1], sep=""), Stats=paste(paste("Seq_length:", export[,3]), paste(paste("No Hits:", export[,5]), "Hit Pos:", export[,4]), export[,2],  sep=", "), seq=export[,6])
	export <- data.frame(Acc=paste(export[,1], export[,2]), seq=export$seq)
	export <- as.vector(unlist(t(export)))
	export <- c(
			paste("Pattern used for search:", mypattern), 
			paste("Number of sequences with at least one match:", sum(Qmyseq$Hits>=1)), 
			paste("Total number of matches:", sum(Qmyseq$Hits)),
			paste("Number of sequences exported to this page:", export_count, "\n"),
			export)
	write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F)
}
cat("\n# (4.2) Export results in fasta format: \n\t fasta_report(Qmyseq=Qmyseq, myfile=\"search\", all=T, nhits) \n\t\t # Settings to export only 10 first hits: \"all=F, nhits=10\".\n")

###################################
## (5) Position Parsing Function ##
###################################

# This step requires a position data frame of the below format, which is automatically generated by the pattern_fct()
# function. The following example is for parsing ORFs from chromosomes. The ID numbers in the first colum 
# need to match the ones in myseq. Additional tracking columns can be appended at the end! Very long sequences like chromosomes
# require a lot of memory for parsing (~5GB per 100 million bps).
# ID  start end  count  seq_length 
# 1   500   521  7852   30432563   
# 2   500   521  4853   19705359   
# 3   500   521  6048   23470805   
# 4   500   521  4655   18585042   
# 5   500   521  7072   26992728   
# C   500   541  88     154478     
# M   500   541  122    366924     

# Define parse function
parsePos <- function(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame) {
        parsel <- as.list(as.character(myseql[,2]))
        names(parsel) <- myseql[,1]
        parsel <- parsel[unique(as.vector(parseIDs))]
        myids <- names(parsel)
        parsel <- lapply(parsel, strsplit, "")
        parsel <- lapply(parsel, unlist)
        exceedl <- apply(coor, 1, function(x) parsel[[x[1]]][as.numeric(x[2]):as.numeric(x[3])])
        exceed <- lapply(exceedl, function(x) sum(is.na(x)))
        if(sum(unlist(exceed))>0) {
                bad_coor <- which(as.vector(unlist(exceed))>0)
                cat("Warning: the coordinates of the following sequences are off, which caused NA insertions:\n", myids[bad_coor], "\n")
        }
        parsel <- as.list(apply(coor[,1:3], 1, function(x) paste(parsel[[as.vector(x[1])]][as.numeric(x[2]):as.numeric(x[3])],collapse="")))
        names(parsel) <- paste(coor[,1], "Seg", coor[,2], coor[,3], sep="_")
        parsel
}
cat("\n# (5.1) Parse coordinates:\n\t seql <- parsePos(myseql=myseq[,c(1,4)], parseIDs=pos_frame$ID, coor=pos_frame)\n")

# Function to add up- or down-stream to position data frame
getUpDown <- function(pos_frame=pos_frame, getUp=0, getDown=0) {
	pos_frame$start <- as.numeric(as.vector(pos_frame$start)) - getUp
	pos_frame$end <- as.numeric(as.vector(pos_frame$end)) + getDown
	pos_frame$start[pos_frame$start < 1] <- 1
	pos_frame$end[pos_frame$end > pos_frame$seq_length] <- pos_frame$seq_length[pos_frame$end > pos_frame$seq_length]
	pos_frame
}
cat("\n# (5.2) Up- or down-stream regions can be retrieved by providing length info in getDown/getUp arguments: \n\t pos_frame <- getUpDown(pos_frame=pos_frame, getUp=0, getDown=0)\n")

# Export sequence list to text file in fasta format
seql2fasta <- function(seql=seql, myfile="myfile") {
        names(seql) <- paste(">", names(seql), sep="")
        export <- data.frame(IDs=names(seql), seq=unlist(seql))
        export <- as.vector(unlist(t(export)))
        write.table(export, file=paste(myfile, ".txt", sep=""), quote=F, row.names=F, col.names=F)
}
cat("\n# (5.3) Export parse results in fasta format:\n\t seql2fasta(seql=seql, myfile=\"myfile\")\n")