#!/lab/64/bin/python
# preProcess_getASEventReadCounts.py
# Author: Angela Brooks
# Program Completion Date:
# Modification Date(s):
# Copyright (c) 2011, Angela Brooks. anbrooks@gmail.com
# All rights reserved.
"""Takes SAM/BAM alignments as input and creates two files that are used as
input to getASEventReadCounts.py:

- *_junctions.bed - Junctions in BED format.
- *_genome_reads.txt.gz - Gzipped text file containing genome reads. Reads are
  assumed to be the same length. Format is query_name, chr, start, end.

Additional files if paired-end reads exist:

- *_paired_end_junctions2qname.txt - File containing the start and end of
  introns that are inferred from junctions. Only from paired-end reads.
- *_paired_end_as_single_genome.txt - Text file containing genome reads that
  are from paired-end reads.
"""
# *_intron_exon_junctions.txt - Text file that is output from
# intron_exon_jctn_counts.py. Used for quantifying intron retention,
# alternative donor, and alternative acceptor events.
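
# Example invocation (a sketch; the file and directory names below are
# hypothetical, not part of this script):
#   python preProcess_getASEventReadCounts.py -s sample.bam -n sample -o preprocess_out
# This would produce preprocess_out/sample_junctions.bed and
# preprocess_out/sample_genome_reads.txt.gz. Each genome read line is
# tab-delimited query_name, chr, start, end, e.g.:
#   read_1001    chr1    11874    11948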

import sys
import optparse
import os
import pdb
import math
import pysam
import gzip

#############
# CONSTANTS #
#############
#IE_SCRIPT = "/h/angela/bin/intron_exon_jctn_counts.py"
DEF_NAME = "getASEventReadCounts_input"

K_RGB_STR = "0,0,0"
N_RGB_STR = "0,60,120"

# Corresponds to reads equally split between 2 different offsets.
DEF_CONFIDENCE = 1.0
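# Worked example of this cutoff (see getShannonIndex below; illustrative
# offsets, not from real data): with two distinct read offsets supported
# equally, p = 0.5 for each, so the score is
# -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0. Four reads at offsets
# {30: 2, 45: 1, 50: 1} would score
# -(0.5*log2(0.5) + 0.25*log2(0.25) + 0.25*log2(0.25)) = 1.5 and pass, while
# four reads all at the same offset score 0 and fail (unless forced via -j).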

# Some strange junction alignments occur towards the beginning of the
# chromosome, which gives errors later. These are likely artifacts, anyway.
# Junction alignments at the beginning of the chromosome must have at least
# this number of bases in the resulting alignments.
DEF_BEG_JCN_OVERHANG = 25

SHELL = "/bin/tcsh"

MAX_CHAR = 60

CIGAR_OPER = {0:"M", 1:"I", 2:"D", 3:"N", 4:"S", 5:"H", 6:"P", 7:"=", 8:"X"}

#HUMAN_GRCh37_2_UCSC_hg19_chr = {"1":"chr1",
#                                "2":"chr2",
#                                "3":"chr3",
#                                "4":"chr4",
#                                "5":"chr5",
#                                "6":"chr6",
#                                "7":"chr7",
#                                "8":"chr8",
#                                "9":"chr9",
#                                "10":"chr10",
#                                "11":"chr11",
#                                "12":"chr12",
#                                "13":"chr13",
#                                "14":"chr14",
#                                "15":"chr15",
#                                "16":"chr16",
#                                "17":"chr17",
#                                "18":"chr18",
#                                "19":"chr19",
#                                "20":"chr20",
#                                "21":"chr21",
#                                "22":"chr22",
#                                "X":"chrX",
#                                "Y":"chrY",
#                                "MT":"chrY",
#                                "GL000207.1":"chrUn_gl000207",

#################
# END CONSTANTS #
#################

###########
# CLASSES #
###########
class OptionParser(optparse.OptionParser):
    """
    Adding a method for required arguments.
    Taken from:
    http://www.python.org/doc/2.3/lib/optparse-extending-examples.html
    """
    def check_required(self, opt):
        option = self.get_option(opt)

        # Assumes the option's 'default' is set to None!
        if getattr(self.values, option.dest) is None:
            print "%s option not supplied" % option
            self.print_help()
            sys.exit(1)


class JcnInfo:
    """
    This object holds all information necessary for a junction.
    The junctions will be written in BED format.
    """
    def __init__(self, name, chr, chromStart, chromEnd, strand,
                 first_block, second_block, intron_start, intron_end,
                 multiJcnBlock=None):
        self.name = name

        self.chr = chr
        if not chr.startswith("chr"):
            self.chr = "chr" + chr

        self.leftmost_start = chromStart
        self.rightmost_end = chromEnd

        total_length = chromEnd - chromStart
        self.second_block_start = total_length - second_block

        self.longest_first_block = first_block
        self.longest_second_block = second_block

        self.strand = strand

        self.block_list = [first_block]

        # Used for calculating entropy when reads span multiple jcns.
        if multiJcnBlock:
            self.multiJcnBlock_list = [multiJcnBlock]
        else:
            self.multiJcnBlock_list = []

        self.intron_start = intron_start
        self.intron_end = intron_end

    def updateJcnInfo(self, name, chr, chromStart, chromEnd, strand,
                      first_block, second_block, intron_start, intron_end,
                      multiJcnBlock=None):
        # Check that name, chromosome, and strand are the same
        if not chr.startswith("chr"):
            chr = "chr" + chr

        if self.name != name:
            print "Junction names do not match: %s, %s" % (self.name, name)
            sys.exit(1)
        if self.chr != chr:
            print "Error with chromosome: %s, %s" % (self.name, name)
            sys.exit(1)
        if self.strand != strand:
            print "Warning: Junction reads aligning to both strands: %s. Setting strand=." % self.name
            self.strand = "."
        if self.intron_start != intron_start:
            print "Error with intron start: %d, %d" % (self.intron_start, intron_start)
            sys.exit(1)
        if self.intron_end != intron_end:
            print "Error with intron end: %d, %d" % (self.intron_end, intron_end)
            sys.exit(1)

        # Check block starts
        if (self.rightmost_end - self.longest_second_block) != (chromEnd - second_block):
            print "Error with second block in %s, %s" % (self.name, name)
            sys.exit(1)
        if (self.leftmost_start + self.longest_first_block) != (chromStart + first_block):
            print "Error with first block %s, %s" % (self.name, name)
            sys.exit(1)

        if chromStart < self.leftmost_start:
            self.leftmost_start = chromStart
            self.longest_first_block = first_block
            # Change second block start
            total_length = self.rightmost_end - self.leftmost_start
            self.second_block_start = total_length - self.longest_second_block

        if chromEnd > self.rightmost_end:
            self.rightmost_end = chromEnd
            self.longest_second_block = second_block
            # Change second block start
            total_length = self.rightmost_end - self.leftmost_start
            self.second_block_start = total_length - self.longest_second_block

        self.block_list.append(first_block)

        if multiJcnBlock:
            self.multiJcnBlock_list.append(multiJcnBlock)

###############
# END CLASSES #
###############

########
# MAIN #
########
def main():
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-s",
                          dest="sam_file",
                          type="string",
                          help="""SAM/BAM file of read alignments to junctions
                                  and the genome. More than one file can be
                                  listed, comma-delimited, e.g.,
                                  file_1.bam,file_2.bam""",
                          default=None)
    opt_parser.add_option("--unique",
                          dest="unique_only",
                          action="store_true",
                          help="""Only keeps uniquely aligned reads. Looks for
                                  an NH tag equal to 1 for this
                                  information.""",
                          default=False)
    opt_parser.add_option("-n",
                          dest="name",
                          type="string",
                          help="""Name prefixed to output files and used for
                                  the output BED file. Default=%s""" % DEF_NAME,
                          default=DEF_NAME)
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="""Expected read length if all reads should be
                                  of the same length""",
                          default=None)
    opt_parser.add_option("-c",
                          dest="confidence_score",
                          type="float",
                          help="""The minimum entropy score a junction has to
                                  have in order to be considered confident. The
                                  entropy score is the Shannon entropy of the
                                  read offsets supporting the junction.
                                  Default=%s""" % DEF_CONFIDENCE,
                          default=DEF_CONFIDENCE)
    opt_parser.add_option("-j",
                          dest="forced_junctions",
                          type="string",
                          help="""File containing intron coordinates that
                                  correspond to junctions that will be kept
                                  regardless of the confidence score.""",
                          default=None)
    opt_parser.add_option("-o",
                          dest="output_dir",
                          type="string",
                          help="Directory to place all output files.",
                          default=None)
    # opt_parser.add_option("-p",
    #                       dest="paired_end_exists",
    #                       action="store_true",
    #                       help="Flag to indicate there are paired-end reads.",
    #                       default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-s")
    opt_parser.check_required("-o")

    sam_files = []
    sam_file_names = options.sam_file.split(",")
    for sam_file in sam_file_names:
        if sam_file.endswith(".sam"):
            sam_files.append(open(sam_file))
        elif sam_file.endswith(".bam"):
            sam_files.append(pysam.Samfile(sam_file, "rb"))
            # sam_file = gzip.open(options.sam_file)
        else:
            print "Error in -s: Expecting .sam or .bam file."
            opt_parser.print_help()
            sys.exit(1)

    unique_only = options.unique_only
    name = options.name
    read_len = options.read_length
    confidence_score = options.confidence_score
    out_dir = options.output_dir
    # paired_end_exists = options.paired_end_exists
    # Still working on the paired-end version
    paired_end_exists = False

    # Create output directory if it doesn't exist.
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
        out_dir = os.path.abspath(out_dir)
        print "Creating output directory: %s" % out_dir
    else:
        out_dir = os.path.abspath(out_dir)

    if not out_dir.endswith("/"):
        out_dir += "/"

    forced_jcns = None
    if options.forced_junctions:
        forced_jcns = getForcedJunctions(options.forced_junctions)
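
    # The forced junction file (see getForcedJunctions below) is expected to
    # contain tab-delimited intron coordinates, one per line; a strand column
    # may follow and is ignored. For example (hypothetical coordinates):
    #   chr1    1025    1124
    #   chr2    5000    5500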

    # Creating output files.
    junction_bed_file_name = "%s%s_junctions.bed" % (out_dir, name)
    junction_bed_file = open(junction_bed_file_name, "w")

    genome_file_name = "%s%s_genome_reads.txt.gz" % (out_dir, name)
    genome_file = gzip.open(genome_file_name, "wb")

    # Paired-end reads exist.
    if paired_end_exists:
        jcn2qname_file_name = "%s%s_paired_end_junctions2qname.txt" % (out_dir, name)
        jcn2qname_file = open(jcn2qname_file_name, "w")

        paired_end_genome_file_name = "%s%s_paired_end_as_single_genome.txt" % (out_dir, name)
        paired_end_genome_file = open(paired_end_genome_file_name, "w")

    # Creating names of files for intron/exon junction counts.
    # ie_coord_filename = "%s%s_intron_exon_junction_coords.out" % (out_dir, name)
    # ie_read_assoc_filename = "%s%s_intron_exon_junction_coords_w_read.out" % (out_dir, name)
    # ie_read_count_filename = "%s%s_intron_exon_junction_counts.txt" % (out_dir, name)

    # Dictionary to find JcnInfo to update as junction reads are encountered
    jcn2JcnInfo = {}

    jcn2pairedReads = {}

    deletionFlag = False
    insertionFlag = False
    softclipFlag = False
    hardclipFlag = False
    truncation_warn = False

    print "Parsing SAM file"
    for sam_file in sam_files:
        for line in sam_file:
            # BAM input is iterated as AlignedRead objects rather than text
            # lines, so check the object type (this also handles mixed
            # .sam/.bam input lists correctly).
            if isinstance(sam_file, pysam.Samfile):
                # I realize this is bad style, but the original code was written
                # for parsing SAM files, so less updating is necessary if I
                # convert the AlignedRead object back to a SAM line.
                line = convert2SAMLine(sam_file, line)

            line = formatLine(line)

            # Ignore headers
            if line.startswith("@"):
                continue

            sam_elems = line.split("\t")

            if len(sam_elems) < 11:
                print "Error in SAM file: Expecting at least 11 columns in SAM file."
                sys.exit(1)

            q_name = sam_elems[0]
            flag = int(sam_elems[1])
            chr = sam_elems[2]

            # Ignore unmapped
            if chr == "*":
                continue

            if not chr.startswith("chr"):
                chr = "chr" + chr

            chr_start = int(sam_elems[3])
            cigar = sam_elems[5]

            # Ignore unmapped
            if cigar == "*":
                continue

            tags = sam_elems[11:]

            if unique_only:
                isMultiFlag = isMultiMapped(tags)
                if isMultiFlag is None:
                    print "Could not find NH tag for %s" % q_name
                    continue
                if isMultiFlag:
                    continue

            m_count = cigar.count("M")

            i_count = cigar.count("I")
            if i_count > 0:
                if not insertionFlag:
                    print "Not supporting insertions yet, e.g., %s" % cigar
                    insertionFlag = True
                continue

            d_count = cigar.count("D")
            if d_count > 0:
                if not deletionFlag:
                    print "Not supporting deletions yet, e.g., %s" % cigar
                    deletionFlag = True
                continue

            s_count = cigar.count("S")
            if s_count > 0:
                if not softclipFlag:
                    print "Not supporting softclipping yet, e.g., %s" % cigar
                    softclipFlag = True
                continue

            h_count = cigar.count("H")
            if h_count > 0:
                if not hardclipFlag:
                    print "Not supporting hardclipping yet, e.g., %s" % cigar
                    hardclipFlag = True
                continue

            # Check if it is a genome read
            if m_count == 1:
                # A GENOME READ
                this_read_len = int(cigar.rstrip("M"))

                if read_len:
                    if this_read_len != read_len:
                        print "Expecting reads of length: %d" % read_len
                        print line
                        sys.exit(1)

                chr_end = chr_start + this_read_len - 1

                genome_line = "%s\t%s\t%d\t%d\n" % (q_name, chr, chr_start, chr_end)

                if paired_end_exists:
                    if isPairedRead(flag):
                        paired_end_genome_file.write(genome_line)
                    else:
                        genome_file.write(genome_line)
                else:
                    genome_file.write(genome_line)
            else:
                # A JUNCTION READ
                n_count = cigar.count("N")
                if n_count == 0:
                    print "Expecting a junction read: %s" % cigar
                    continue

                n_split = cigar.split("N")

                # Get the downstream block length
                downstr_len = int(n_split.pop().rstrip("M"))

                # A list to hold the information about each intron. Used in
                # cases where a read aligns to multiple junctions.
                introns_info = []

                first_chr_start = chr_start

                # Get first intron information, which also will be used for
                # the upstream length
                upstr_len, intron_len = map(int, n_split.pop(0).split("M"))
                introns_info.append((chr_start, intron_len, None))

                # Updating the chr_start
                chr_start = chr_start + upstr_len + intron_len

                # Get remaining information from additional introns if there
                # are any. The 3rd element is used in calculating entropy.
                for remaining_intron in n_split:
                    exon_len, intron_len = map(int, remaining_intron.split("M"))
                    introns_info.append((chr_start + exon_len - upstr_len,
                                         intron_len,
                                         chr_start - first_chr_start))
                    chr_start = chr_start + exon_len + intron_len
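
                # For example, a hypothetical read with CIGAR 25M100N20M200N31M
                # at POS 1000 gives downstr_len = 31 and
                # introns_info = [(1000, 100, None), (1120, 200, 125)],
                # i.e., introns 1025-1124 and 1145-1344 once upstr_len (25) is
                # added back in the loop below.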

                jcn_tag = None
                jcn_strand = None
                for tag in tags:
                    if tag.startswith("Y0"):
                        jcn_tag = tag[5:]
                        if len(jcn_tag) > MAX_CHAR:
                            jcn_tag = jcn_tag[:MAX_CHAR]
                            if not truncation_warn:
                                truncation_warn = True
                                print "Warning: Y0 tags truncated to %d characters." % MAX_CHAR
                    if tag.startswith("XS"):
                        tag, almost_strand = tag.split("A")
                        jcn_strand = almost_strand.lstrip(":")
                        if jcn_strand != "+" and jcn_strand != "-" and jcn_strand != ".":
                            print "Error in strand information for: %s" % line
                            sys.exit(1)

                # Now insert all introns into jcn dictionary.
                for intron_info in introns_info:
                    chr_start = intron_info[0]
                    intron_len = intron_info[1]

                    total_len = upstr_len + intron_len + downstr_len

                    jcn_str = None
                    if not jcn_tag:
                        # Create a junction string based on the 1-based
                        # junction coordinate
                        jcn_str = "%s:%d-%d" % (chr,
                                                chr_start + upstr_len,
                                                chr_start + upstr_len + intron_len - 1)
                    else:
                        # Need to make multiple jcn_str for each intron
                        jcn_str = "%s|%s:%d-%d" % (jcn_tag,
                                                   chr,
                                                   chr_start + upstr_len,
                                                   chr_start + upstr_len + intron_len - 1)

                    # Check for odd junctions that are aligned toward the
                    # beginning of the chromosome, which causes problems later
                    if (chr_start + upstr_len) < DEF_BEG_JCN_OVERHANG:
                        continue

                    if not jcn_strand:
                        jcn_strand = "."

                    # Get BED format information
                    chromStart = chr_start - 1
                    chromEnd = chromStart + total_len

                    # Now add junction to dictionary
                    if jcn_str in jcn2JcnInfo:
                        jcn2JcnInfo[jcn_str].updateJcnInfo(jcn_str, chr,
                                                           chromStart, chromEnd,
                                                           jcn_strand,
                                                           upstr_len, downstr_len,
                                                           chr_start + upstr_len,
                                                           chr_start + upstr_len + intron_len - 1,
                                                           intron_info[2])
                    else:
                        jcn2JcnInfo[jcn_str] = JcnInfo(jcn_str, chr,
                                                       chromStart, chromEnd,
                                                       jcn_strand,
                                                       upstr_len, downstr_len,
                                                       chr_start + upstr_len,
                                                       chr_start + upstr_len + intron_len - 1,
                                                       intron_info[2])

                    # Add read to paired-end dictionary:
                    if paired_end_exists:
                        if isPairedRead(flag):
                            jcn_str = "%s:%d-%d" % (chr,
                                                    chr_start + upstr_len,
                                                    chr_start + upstr_len + intron_len - 1)
                            if jcn_str in jcn2pairedReads:
                                jcn2pairedReads[jcn_str].append(q_name)
                            else:
                                jcn2pairedReads[jcn_str] = [q_name]

    # Close the genome file:
    genome_file.close()
    if paired_end_exists:
        paired_end_genome_file.close()

    # Done processing all SAM lines, now make junction BED file
    print "Making Junction BED File"
    bed_header = "track name=\"%s_jcn_counts\" itemRgb=\"On\" useScore=1\n" % name
    junction_bed_file.write(bed_header)
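
    # Each data line written below is BED12-style. As an illustration with
    # made-up reads (both uniquely mapped, XS:A:+ and no Y0 tag assumed): a
    # junction chr1:1025-1124 supported by 30M100N46M at POS 995 and
    # 20M100N56M at POS 1005 would be written as:
    #   chr1  994  1180  chr1:1025-1124  2  +  994  1180  0,0,0  2  30,56  0,130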

    confident_jcns = set([])

    for jcn_str in jcn2JcnInfo:
        if isConfidentJunction(jcn2JcnInfo[jcn_str].block_list,
                               jcn2JcnInfo[jcn_str].multiJcnBlock_list,
                               confidence_score,
                               jcn2JcnInfo[jcn_str].chr,
                               jcn2JcnInfo[jcn_str].intron_start,
                               jcn2JcnInfo[jcn_str].intron_end,
                               forced_jcns):

            confident_jcns.add("%s:%d-%d" % (jcn2JcnInfo[jcn_str].chr,
                                             jcn2JcnInfo[jcn_str].intron_start,
                                             jcn2JcnInfo[jcn_str].intron_end))

            # If junction name has ":::", then the type of junction is known
            if ":::" in jcn_str:
                rgb_str = getRGB(jcn_str)
            else:
                rgb_str = K_RGB_STR

            bed_line = "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t2\t%s\t%s\n" % (
                jcn2JcnInfo[jcn_str].chr,
                jcn2JcnInfo[jcn_str].leftmost_start,
                jcn2JcnInfo[jcn_str].rightmost_end,
                jcn_str,
                len(jcn2JcnInfo[jcn_str].block_list),
                jcn2JcnInfo[jcn_str].strand,
                jcn2JcnInfo[jcn_str].leftmost_start,
                jcn2JcnInfo[jcn_str].rightmost_end,
                rgb_str,
                ",".join([repr(jcn2JcnInfo[jcn_str].longest_first_block),
                          repr(jcn2JcnInfo[jcn_str].longest_second_block)]),
                ",".join(["0", repr(jcn2JcnInfo[jcn_str].second_block_start)]))

            junction_bed_file.write(bed_line)

    junction_bed_file.close()

    # Print out junction to read file for paired end reads
    if paired_end_exists:
        for jcn_str in jcn2pairedReads:
            if jcn_str in confident_jcns:
                outline = "%s\t%s\n" % (jcn_str,
                                        ",".join(jcn2pairedReads[jcn_str]))
                jcn2qname_file.write(outline)
        jcn2qname_file.close()

    sys.exit(0)

############
# END_MAIN #
############

#############
# FUNCTIONS #
#############
def convert2CIGAR(cigar_tuple):
    cigar_str = ""
    for tup in cigar_tuple:
        cigar_str += "%d%s" % (tup[1], CIGAR_OPER[tup[0]])
    return cigar_str


def convert2SAMLine(bam_file, read_obj):
    # Checks due to deprecation of variables in pysam
    try:
        rnext = read_obj.rnext
        pnext = read_obj.pnext
        tlen = read_obj.tlen
    except:
        rnext = read_obj.mrnm
        pnext = read_obj.mpos
        tlen = read_obj.isize

    try:
        rname = bam_file.getrname(read_obj.tid)
    except:
        rname = None

    samline = "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s" % (
        read_obj.qname,
        read_obj.flag,
        rname if rname else "*",
        read_obj.pos + 1,
        read_obj.mapq,
        convert2CIGAR(read_obj.cigar) if read_obj.cigar else "*",
        rnext if rnext else "*",
        pnext + 1 if pnext else 0,
        tlen if tlen else 0,
        read_obj.seq,
        read_obj.qual)

    if read_obj.tags:
        if read_obj.tags == []:
            return samline + "\n"

        tags = convert2Tags(read_obj.tags)
        if tags != "":
            samline += "\t%s" % tags

    return samline + "\n"


def convert2Tags(tag_tuples):
    tag_list = []
    for tag_tup in tag_tuples:
        if tag_tup[0] == "Y0":
            tag_list.append("Y0:Z:%s" % tag_tup[1])
        if tag_tup[0] == "XS":
            tag_list.append("XS:A:%s" % tag_tup[1])
        if tag_tup[0] == "NH":
            tag_list.append("NH:i:%d" % tag_tup[1])
    return "\t".join(tag_list)


def convertFlag(flag):
    # Bitwise operation to get strand information
    if flag & 16:
        return "-"
    else:
        return "+"


def formatLine(line):
    line = line.replace("\r", "")
    line = line.replace("\n", "")
    return line


def getForcedJunctions(forced_junction_file):
    """
    File has chr start end for introns that should be kept in the analysis.
    Set elements are "chr_start_end".
    """
    jcn_file = open(forced_junction_file)

    forced_junctions = set([])

    for line in jcn_file:
        line = formatLine(line)
        lineList = line.split("\t")

        # Slicing the list in case strand is included in the file
        if len(lineList) < 3:
            print "Problem with forced junction file. Needs to be tab-delimited: chr start end"
            sys.exit(1)

        if not lineList[0].startswith("chr"):
            lineList[0] = "chr" + lineList[0]

        forced_junctions.add("_".join(lineList[0:3]))

    jcn_file.close()

    return forced_junctions


def getRGB(jcn_str):
    elems = jcn_str.split(":::")
    for elem in elems:
        if elem == "A":
            return K_RGB_STR
    return N_RGB_STR


def getPos2Count(blocklist):
    pos2count = {}
    for block in blocklist:
        if block in pos2count:
            pos2count[block] += 1
        else:
            pos2count[block] = 1
    return pos2count


def getShannonIndex(pos2countDict, totalCount):
    """
    Calculates a Shannon-Wiener index for the positions and counts. It treats
    each position like a unique species.
    """
    if totalCount == 0:
        return 0

    summation = 0
    for pos in pos2countDict:
        p = float(pos2countDict[pos]) / totalCount
        if p == 0.0:
            continue
        summation += p * math.log(p, 2)

    if summation == 0:
        return 0

    return -summation


def isConfidentJunction(block_list, multiJcnBlockList, confidence_score,
                        jcn_chr, jcn_start, jcn_end, forced_jcns):
    """
    Confidence is given as the entropy score associated with each junction.
    """
    if forced_jcns:
        # Forced junctions are stored as "chr_start_end" by getForcedJunctions,
        # so the lookup key has to use the same format.
        jcn_str = "%s_%d_%d" % (jcn_chr, jcn_start, jcn_end)
        if jcn_str in forced_jcns:
            return True

    all_blocks = list(block_list)
    all_blocks.extend(multiJcnBlockList)

    pos2count = getPos2Count(all_blocks)
    totalCount = len(all_blocks)

    entropy = getShannonIndex(pos2count, totalCount)

    if entropy >= confidence_score:
        return True
    else:
        return False


def isMultiMapped(tags):
    NHtag_found = False
    for tag in tags:
        if tag.startswith("NH"):
            NHtag_found = True
            if tag.split(":")[-1] != "1":
                return True

    if not NHtag_found:
        return None

    return False


def isPairedRead(flag):
    # If read is first in a pair
    if flag & 64:
        return True
    # If read is second in a pair
    if flag & 128:
        return True
    return False

#################
# END FUNCTIONS #
#################
if __name__ == "__main__":
    main()