#!/usr/local/bin/python3 # Michael Matschiner, 2015-03-26 # michaelmatschiner@mac.com # Import libraries and make sure we're on python 3. import sys if sys.version_info[0] < 3: print('Python 3 is needed to run this script!') sys.exit(1) import argparse, textwrap, random, tempfile, os, subprocess, statistics, re # Parse the command line arguments. parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''\ %(prog)s ----------------------------------------- This script uses one fasta format query file with one or more sequence entries and one or more subject sequence files that must also be in fasta format, and must have blast databases prepared for them with makeblastdb. Putative orthologs to query sequences are detected in subject sequences based on variable settings for strictness and evalue and/or bitscore thresholds. The query sequence(s) can be divided into sliding windows. Blast results are used to build a (multiple) sequence alignment and exported in fasta or phylip format. Note that blast+ must be installed on your system for this script to run (also, blast+ 2.2.26 and 2.2.30 work, but 2.2.29 throws an error when the query is composed of only missing data). On the other hand, blast+ 2.2.26 is not compatible with genetic codes other than the standard code. ''')) parser.add_argument( '-v', '--version', action='version', version='%(prog)s 0.77', help='show program version number and exit.' ) parser.add_argument( 'query', nargs=1, type=argparse.FileType('r'), help='the query file name (fasta).' ) parser.add_argument( 'subject', nargs='+', type=str, help="one or more blast database names. If a single file name is provided \ and the first character of the file is not '>', it is assumed to be a list \ of blast database names.") parser.add_argument( '-s', '--strictness', nargs=1, type=str, default=[0], metavar="INT", dest='strictness', help="strictness setting for ortholog identification. \ 0: use all subjects with sufficient fit.\ 1: per query fragment region, use the best subject with sufficient fit.\ 2: as 1, but all subjects must be on the same scaffold/contig, have the \ same orientation, and be ordered the same way in the query and the subject." ) parser.add_argument( '-e', '--evalue', nargs=1, type=float, default=[None], metavar="FLOAT", dest='evalue', help="evalue threshold to consider blast hits (regardless of specified \ evalue, only the best blast hit per alignment region is used). Is \ overridden for query sequences that include '[&evalue=FLOAT]' (where \ FLOAT is a floating point number) in their sequence id." ) parser.add_argument( '-b', '--bitscore', nargs=1, type=float, default=[None], metavar="FLOAT", dest='bitscore', help="bitscore threshold to consider blast hits (regardless of specified \ bitscore, only the best blast hit per alignment region is used). Is \ overridden for query sequences that include '[&bitscore=FLOAT]' (where \ FLOAT is a floating point number) in their sequence id." ) parser.add_argument( '-t', '--translate', action='store_true', dest='translate', help="translate subject sequences (use with amino acid sequence query)." ) parser.add_argument( '-c', '--genetic-code', nargs=1, type=float, default=[None], metavar="INT", dest='genetic_code', help="If amino acid sequence queries are used as queries for translated \ subject sequences, the genetic code can be specified with this option. \ Note that at least blast+ 2.2.29 is required with this option.\ See here for a list of all available genetic code:\ http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?" ) parser.add_argument( '-wl', '--window-length', nargs=1, type=int, default=[None], metavar="INT", dest='window_length', help="length of the sliding window." ) parser.add_argument( '-ws', '--window-shift', nargs=1, type=int, default=[None], metavar="INT", dest='window_shift', help="sliding window shift distance in bp." ) parser.add_argument( '-r', '--refine', action='store_true', dest='refine', help="refine the alignment. Note that the software mafft must be installed\ in order to use this option." ) parser.add_argument( '-m', '--minimum-completeness', nargs=1, type=int, default=[None], metavar="INT", dest='minimum_completeness', help="minimum number of alignment sites without missing data." ) parser.add_argument( '--write-empty-files', action='store_true', dest='write_empty_files', help="write empty files if alignment is not sufficiently complete.\ By default, no files are written for these alignments." ) parser.add_argument( '-of', '--output-format', nargs=1, type=str, default=["fasta"], metavar="STR", dest='output_format', help="output alignment format: fasta (default) or phylip." ) parser.add_argument( '--overwrite', action='store_true', dest='overwrite', help="overwrite existing alignments." ) args = parser.parse_args() query = args.query[0] if len(args.subject) == 1: # Check if the file name specified for the subject is a list of # subject file names. with open(args.subject[0], "r") as f: first_line = f.readline() if first_line[0:1] == ">": subjects = args.subject else: subject_list_file = open(args.subject[0], "r") subject_list = subject_list_file.readlines() subjects = [] for subject_line in subject_list: subjects.append(subject_line.strip()) subject_list_file.close() else: subjects = args.subject # Make sure that all subjects files are present. for subject in subjects: if not os.path.isfile(subject): print("ERROR: Subject file " + subject + " could not be found!") sys.exit(1) elif not os.path.isfile(subject + ".nhr") and not os.path.isfile(subject + ".00.nhr"): print("ERROR: File " + subject + ".nhr for subject " + subject + " could not be found!") sys.exit(1) elif not os.path.isfile(subject + ".nin") and not os.path.isfile(subject + ".00.nin"): print("ERROR: File " + subject + ".nin for subject " + subject + " could not be found!") sys.exit(1) elif not os.path.isfile(subject + ".nsq") and not os.path.isfile(subject + ".00.nsq"): print("ERROR: File " + subject + ".nsq for subject " + subject + " could not be found!") sys.exit(1) strictness = int(args.strictness[0]) if args.evalue[0] != None: evalue = float(args.evalue[0]) else: evalue = None if args.bitscore[0] != None: bitscore = float(args.bitscore[0]) else: bitscore = None if args.genetic_code[0] != None: genetic_code = int(args.genetic_code[0]) else: genetic_code = None translate = args.translate window_length = args.window_length[0] window_shift = args.window_shift[0] if window_shift == None and window_length != None: window_shift = window_length refine = args.refine minimum_completeness = args.minimum_completeness[0] write_empty_files = args.write_empty_files output_format = args.output_format[0] overwrite = args.overwrite # Read the query fasta string and store all entries in memory. sys.stdout.flush() query_string = query.read() query_entries = query_string.split(">") query_ids = [] query_seqs = [] query_evalues = [] query_bitscores = [] pattern = re.compile('\[&(.+)\]') for query_entry in query_entries: if query_entry != "": query_entry_lines = query_entry.split("\n") pattern_match = pattern.search(query_entry_lines[0]) if pattern_match == None: query_ids.append(query_entry_lines[0]) query_evalues.append(evalue) query_bitscores.append(bitscore) else: query_ids.append(query_entry_lines[0].replace(pattern_match.group(0),"")) id_command = pattern_match.group(1) evalue_pattern = re.compile('evalue=([\d\.]+)') evalue_match = evalue_pattern.search(id_command) if evalue_match == None: query_evalues.append(evalue) else: query_evalues.append(float(evalue_match.group(1))) bitscore_pattern = re.compile('bitscore=([\d\.]+)') bitscore_match = bitscore_pattern.search(id_command) if bitscore_match == None: query_bitscores.append(bitscore) else: query_bitscores.append(float(bitscore_match.group(1))) query_seqs.append("".join(query_entry_lines[1:])) # Make sure the query sequences do not contain terminal stop codons. Terminal stop codons are accepted but removed. for x in range(len(query_seqs)): if query_seqs[x][-1] == "*": query_seqs[x] = query_seqs[x][:-2] if "*" in query_seqs[x]: print("ERROR: Non-terminal stop codon found in query sequence "+ query_ids[x] + "! These should be removed.") sys.exit(1) # Feedback. print("") print("find_orthologs.py") print("") print("Settings:") if len(query_ids) == 1: print(" query: " + query.name + " (" + str(len(query_ids)) + " sequence)") else: print(" query: " + query.name + " (" + str(len(query_ids)) + " sequences)") for subject in subjects: print(" subject: " + subject) if strictness == 0: print(" strictness: 0") elif strictness == 1: print(" strictness: 1") elif strictness == 2: print(" strictness: 2") else: print("ERROR: strictness must be 0, 1, or 2!") sys.exit(1) if evalue == None: print(" evalue: -") else: print(" evalue: " + str(evalue)) if bitscore == None: print(" bitscore: -") else: print(" bitscore: " + str(bitscore)) if translate == True: print(" translate: yes") else: print(" translate: no") if translate == False: print(" genetic_code: -") else: if genetic_code == None: print(" genetic_code: standard") elif genetic_code == 1: print(" genetic_code: standard") elif genetic_code == 2: print(" genetic_code: vertebrate mitochondrial") elif genetic_code == 3: print(" genetic_code: yeast mitochondrial") elif genetic_code == 5: print(" genetic_code: invertebrate mitochondrial") else: print(" genetic_code: other") if window_length == None: print(" window_length: -") else: print(" window_length: " + str(window_length)) if window_shift == None: print(" window_shift: -") else: print(" window_shift: " + str(window_shift)) if refine: print(" refine: yes") else: print(" refine: no") if minimum_completeness == None: print(" minimum_completeness: -") else: print(" minimum_completeness: " + str(minimum_completeness)) if write_empty_files: print(" write_empty_files: yes") else: print(" write_empty_files: no") print(" output_format: " + str(output_format)) if overwrite: print(" overwrite: yes") else: print(" overwrite: no") print("") sys.stdout.flush() # If a window length is specified, split each query seq into windows of this size. query_window_ids = [] query_window_seqs = [] if window_length != None: print("Splitting the query into windows...", end="") sys.stdout.flush() for x in range(len(query_ids)): window_start_pos = 0 window_end_pos = window_length-1 while window_end_pos < len(query_seqs[x]): window_pos_string = "_W" + str(window_start_pos) + "_" + str(window_end_pos) query_window_ids.append(query_ids[x] + window_pos_string) query_window_seqs.append(query_seqs[x][window_start_pos:window_end_pos+1]) window_start_pos += window_shift window_end_pos += window_shift print(" done.") else: query_window_ids = query_ids query_window_seqs = query_seqs # Go through all sequence windows and use the sequence for BLAST searches against all subjects. for z in range(len(query_window_ids)): # Determine the file name. if output_format == "phylip": aln_file_name = query_window_ids[z] + ".phy" else: aln_file_name = query_window_ids[z] + ".fasta" # If the file already exists, skip to next query window. if os.path.isfile(aln_file_name) and overwrite == False: print("Skipping blast search for file " + aln_file_name + " as file exists already.") continue # Prepare a string in fasta format for this query window. query_string = ">" + query_window_ids[z] + "\n" + query_window_seqs[z] # Write the sequence to a temporary file, so that it can be used as a query for BLAST searches. tmp_query = tempfile.NamedTemporaryFile(delete=False) tmp_query.write(query_string.encode('utf-8')) tmp_query.close() # Prepare lists to store the lists containing gap opening positions, and the lists # containing the aligned subject sequences. gaps_opened_in_query_per_subject = [] sseqids_per_subject = [] bitscores_per_subject = [] if strictness > 0: number_of_hits_per_subject = [] aligned_subject_seq_per_subject = [] if translate: nucl_subject_seq_per_subject = [] aligned_nucl_subject_seq_per_subject = [] # Run BLAST searches against each subject. for subject in subjects: # Feedback. if window_length != None: print("Running blast searches for window " + query_window_ids[z] + " in subject " + subject + "...", end="") sys.stdout.flush() else: print("Running blast searches for query " + query_window_ids[z] + " in subject " + subject + "...", end="") sys.stdout.flush() # Prepare the BLAST command as array. FNULL = open(os.devnull, 'w') if translate: blast_ary = ["tblastn"] else: blast_ary = ["blastn"] if genetic_code != None: blast_ary.append("-db_gencode") blast_ary.append(str(genetic_code)) blast_ary.append("-query") blast_ary.append(tmp_query.name) blast_ary.append("-db") blast_ary.append(subject) if query_evalues[z] != None: blast_ary.append("-evalue") blast_ary.append(str(query_evalues[z])) if strictness > 0: blast_ary.append("-culling_limit") blast_ary.append("1") blast_ary.append("-outfmt") blast_ary.append("6 qseqid sseqid evalue bitscore pident qstart qend sstart send qseq sseq") # Call BLAST. blast_out = subprocess.check_output(blast_ary).decode("utf-8").strip() blast_lines = blast_out.split("\n") if blast_lines == [""]: print("done.") aligned_subject_seq = "" for w in range(len(query_window_seqs[z])): aligned_subject_seq += "-" gaps_opened_in_query_per_subject.append([]) if strictness == 0: sseqids_per_subject.append(["None"]) else: sseqids_per_subject.append("None") if strictness == 0: bitscores_per_subject.append(["None"]) else: bitscores_per_subject.append("None") if strictness > 0: number_of_hits_per_subject.append(0) aligned_subject_seq_per_subject.append(aligned_subject_seq) if translate: if strictness == 0: nucl_subject_seq_per_subject.append([]) else: nucl_subject_seq_per_subject.append("") else: qseqids = [] sseqids = [] evalues = [] bitscores = [] pidents = [] qstarts = [] qends = [] sstarts = [] sends = [] qseqs = [] sseqs = [] for blast_line in blast_lines: blast_line_ary = blast_line.split("\t") if len(blast_line_ary) == 11: qseqids.append(blast_line_ary[0]) sseqids.append(blast_line_ary[1]) evalues.append(float(blast_line_ary[2])) bitscores.append(float(blast_line_ary[3])) pidents.append(float(blast_line_ary[4])) qstarts.append(int(blast_line_ary[5])) qends.append(int(blast_line_ary[6])) sstarts.append(int(blast_line_ary[7])) sends.append(int(blast_line_ary[8])) qseqs.append(blast_line_ary[9]) sseqs.append(blast_line_ary[10]) # If a bitscore threshold had been specified, remove all hits below # this threshold. (The same does not need to be done for evalue # thresholds, as this can be filtered for when running the blast search). if query_bitscores[z] != None: tmp_qseqids = [] tmp_sseqids = [] tmp_evalues = [] tmp_bitscores = [] tmp_pidents = [] tmp_qstarts = [] tmp_qends = [] tmp_sstarts = [] tmp_sends = [] tmp_qseqs = [] tmp_sseqs = [] for x in range(len(qseqids)): if bitscores[x] >= query_bitscores[z]: tmp_qseqids.append(qseqids[x]) tmp_sseqids.append(sseqids[x]) tmp_evalues.append(evalues[x]) tmp_bitscores.append(bitscores[x]) tmp_pidents.append(pidents[x]) tmp_qstarts.append(qstarts[x]) tmp_qends.append(qends[x]) tmp_sstarts.append(sstarts[x]) tmp_sends.append(sends[x]) tmp_qseqs.append(qseqs[x]) tmp_sseqs.append(sseqs[x]) qseqids = tmp_qseqids sseqids = tmp_sseqids evalues = tmp_evalues bitscores = tmp_bitscores pidents = tmp_pidents qstarts = tmp_qstarts qends = tmp_qends sstarts = tmp_sstarts sends = tmp_sends qseqs = tmp_qseqs sseqs = tmp_sseqs # If tblastn was used, remove all hits that contain a stop codon (*). if translate: tmp_qseqids = [] tmp_sseqids = [] tmp_evalues = [] tmp_bitscores = [] tmp_pidents = [] tmp_qstarts = [] tmp_qends = [] tmp_sstarts = [] tmp_sends = [] tmp_qseqs = [] tmp_sseqs = [] for x in range(len(qseqids)): if "*" not in sseqs[x]: tmp_qseqids.append(qseqids[x]) tmp_sseqids.append(sseqids[x]) tmp_evalues.append(evalues[x]) tmp_bitscores.append(bitscores[x]) tmp_pidents.append(pidents[x]) tmp_qstarts.append(qstarts[x]) tmp_qends.append(qends[x]) tmp_sstarts.append(sstarts[x]) tmp_sends.append(sends[x]) tmp_qseqs.append(qseqs[x]) tmp_sseqs.append(sseqs[x]) qseqids = tmp_qseqids sseqids = tmp_sseqids evalues = tmp_evalues bitscores = tmp_bitscores pidents = tmp_pidents qstarts = tmp_qstarts qends = tmp_qends sstarts = tmp_sstarts sends = tmp_sends qseqs = tmp_qseqs sseqs = tmp_sseqs # If not a single blast line was added (after bitscore filtering), # return an empty sequence. if qseqids == []: aligned_subject_seq = "" for w in range(len(query_window_seqs[z])): aligned_subject_seq += "-" gaps_opened_in_query_per_subject.append([]) if strictness == 0: sseqids_per_subject.append(["None"]) else: sseqids_per_subject.append("None") if strictness == 0: bitscores_per_subject.append(["None"]) else: bitscores_per_subject.append("None") if strictness > 0: number_of_hits_per_subject.append(0) aligned_subject_seq_per_subject.append(aligned_subject_seq) if translate: if strictness == 0: nucl_subject_seq_per_subject.append([]) else: nucl_subject_seq_per_subject.append("") print(" done.") # Otherwise continue analysing the blast hits. else: # Find the contig/scaffold with most hits (only with strictness = 2). if strictness == 2: uniq_sseqids = sorted(list(set(sseqids))) uniq_sseqid_occurrences = [] for uniq_sseqid in uniq_sseqids: uniq_sseqid_occurrences.append(sseqids.count(uniq_sseqid)) best_sseqid = uniq_sseqids[uniq_sseqid_occurrences.index(max(uniq_sseqid_occurrences))] # Remove hits for all other contigs/scaffolds (only with strictness = 2). if strictness == 2: tmp_qseqids = [] tmp_sseqids = [] tmp_evalues = [] tmp_bitscores = [] tmp_pidents = [] tmp_qstarts = [] tmp_qends = [] tmp_sstarts = [] tmp_sends = [] tmp_qseqs = [] tmp_sseqs = [] for x in range(len(qseqids)): if sseqids[x] == best_sseqid: tmp_qseqids.append(qseqids[x]) tmp_sseqids.append(sseqids[x]) tmp_evalues.append(evalues[x]) tmp_bitscores.append(bitscores[x]) tmp_pidents.append(pidents[x]) tmp_qstarts.append(qstarts[x]) tmp_qends.append(qends[x]) tmp_sstarts.append(sstarts[x]) tmp_sends.append(sends[x]) tmp_qseqs.append(qseqs[x]) tmp_sseqs.append(sseqs[x]) qseqids = tmp_qseqids sseqids = tmp_sseqids evalues = tmp_evalues bitscores = tmp_bitscores pidents = tmp_pidents qstarts = tmp_qstarts qends = tmp_qends sstarts = tmp_sstarts sends = tmp_sends qseqs = tmp_qseqs sseqs = tmp_sseqs # Determine the orientation for all hits (only with strictness = 2). if strictness == 2: sorients = [] for x in range(len(qseqids)): if qstarts[x] < qends[x]: if sstarts[x] < sends[x]: sorients.append("for") else: sorients.append("rev") else: if sstarts[x] < sends[x]: sorients.append("rev") else: sorients.append("for") # Find the more common orientation (only with strictness = 2). if strictness == 2: if sorients.count("rev") > sorients.count("for"): common_orient = "rev" else: common_orient = "for" # Discard hits that have the less common orientation (only with strictness = 2). if strictness == 2: tmp_qseqids = [] tmp_sseqids = [] tmp_evalues = [] tmp_bitscores = [] tmp_pidents = [] tmp_qstarts = [] tmp_qends = [] tmp_sstarts = [] tmp_sends = [] tmp_qseqs = [] tmp_sseqs = [] for x in range(len(qseqids)): if sorients[x] == common_orient: tmp_qseqids.append(qseqids[x]) tmp_sseqids.append(sseqids[x]) tmp_evalues.append(evalues[x]) tmp_bitscores.append(bitscores[x]) tmp_pidents.append(pidents[x]) tmp_qstarts.append(qstarts[x]) tmp_qends.append(qends[x]) tmp_sstarts.append(sstarts[x]) tmp_sends.append(sends[x]) tmp_qseqs.append(qseqs[x]) tmp_sseqs.append(sseqs[x]) qseqids = tmp_qseqids sseqids = tmp_sseqids evalues = tmp_evalues bitscores = tmp_bitscores pidents = tmp_pidents qstarts = tmp_qstarts qends = tmp_qends sstarts = tmp_sstarts sends = tmp_sends qseqs = tmp_qseqs sseqs = tmp_sseqs # Sort all arrays by the qstart. all_sorted = False while all_sorted == False: all_sorted = True for x in range(len(qseqids)-1): if qstarts[x] > qstarts[x+1]: all_sorted = False qseqids[x], qseqids[x+1] = qseqids[x+1], qseqids[x] sseqids[x], sseqids[x+1] = sseqids[x+1], sseqids[x] evalues[x], evalues[x+1] = evalues[x+1], evalues[x] bitscores[x], bitscores[x+1] = bitscores[x+1], bitscores[x] pidents[x], pidents[x+1] = pidents[x+1], pidents[x] qstarts[x], qstarts[x+1] = qstarts[x+1], qstarts[x] qends[x], qends[x+1] = qends[x+1], qends[x] sstarts[x], sstarts[x+1] = sstarts[x+1], sstarts[x] sends[x], sends[x+1] = sends[x+1], sends[x] qseqs[x], qseqs[x+1] = qseqs[x+1], qseqs[x] sseqs[x], sseqs[x+1] = sseqs[x+1], sseqs[x] # If two or more hits have overlapping query ranges, discard the one with lower evalue. if strictness > 0: no_overlaps = False while no_overlaps == False: no_overlaps = True for x in range(len(qseqids)-1): if qends[x] >= qstarts[x+1]: no_overlaps = False if evalues[x] < evalues[x+1]: # Remove hit x+1 from the dataset. del qseqids[x+1] del sseqids[x+1] del evalues[x+1] del bitscores[x+1] del pidents[x+1] del qstarts[x+1] del qends[x+1] del sstarts[x+1] del sends[x+1] del qseqs[x+1] del sseqs[x+1] else: # Remove hit x from the dataset. del qseqids[x] del sseqids[x] del evalues[x] del bitscores[x] del pidents[x] del qstarts[x] del qends[x] del sstarts[x] del sends[x] del qseqs[x] del sseqs[x] break # Determine the order for all queries (only with strictness = 2). if strictness == 2: qorder = [] order = 1 for x in range(len(qseqids)): qorder.append(order) order += 1 # Determine the order for all subjects (only with strictness = 2). if strictness == 2: sorder = [] sorted_sstarts = sorted(sstarts) if common_orient == "rev": sorted_sstarts.reverse() for x in range(len(qseqids)): sorder.append(sorted_sstarts.index(sstarts[x])+1) # Discard those hits that violate the order until the two orders are the same # (only with strictness = 2). if strictness == 2: while sorder != qorder: # Find the hits that violate the order the most. violation = [] for x in range(len(qseqids)): violation.append(abs(sorder[x]-qorder[x])) max_violation = max(violation) # Discard hits that violate the order the most. tmp_qseqids = [] tmp_sseqids = [] tmp_evalues = [] tmp_bitscores = [] tmp_pidents = [] tmp_qstarts = [] tmp_qends = [] tmp_sstarts = [] tmp_sends = [] tmp_qseqs = [] tmp_sseqs = [] for x in range(len(qseqids)): if violation[x] < max_violation: tmp_qseqids.append(qseqids[x]) tmp_sseqids.append(sseqids[x]) tmp_evalues.append(evalues[x]) tmp_bitscores.append(bitscores[x]) tmp_pidents.append(pidents[x]) tmp_qstarts.append(qstarts[x]) tmp_qends.append(qends[x]) tmp_sstarts.append(sstarts[x]) tmp_sends.append(sends[x]) tmp_qseqs.append(qseqs[x]) tmp_sseqs.append(sseqs[x]) qseqids = tmp_qseqids sseqids = tmp_sseqids evalues = tmp_evalues bitscores = tmp_bitscores pidents = tmp_pidents qstarts = tmp_qstarts qends = tmp_qends sstarts = tmp_sstarts sends = tmp_sends qseqs = tmp_qseqs sseqs = tmp_sseqs # Determine the order for all queries. qorder = [] order = 1 for x in range(len(qseqids)): qorder.append(order) order += 1 # Determine the order for all subjects. sorder = [] sorted_sstarts = sorted(sstarts) if common_orient == "rev": sorted_sstarts.reverse() for x in range(len(qseqids)): sorder.append(sorted_sstarts.index(sstarts[x])+1) # Initiate gaps_opened_in_query, a list to store the position of all gaps that # this subject (with strictness > 0) / these subjects (with strictness = 0) # cause(s) in the query sequence. # With strictness = 0, this list contains other lists, one for each # subject. if strictness == 0: # Initiate gaps_opened_in_query gaps_opened_in_query = [] # Initiate a list to store the starting positions of each subject sequence in # a multiple sequence alignment with gaps. sstarts_in_msa = [] # In order to know which gaps are opened in the query by each subject sequence, # the query sequences need to be checked for "-". for x in range(len(qseqids)): # Make the xth item of list gaps_opened_in_query a list in itself. gaps_opened_in_query.append([]) # Find out how many gaps have already been opened before the start position # of this query sequence region (=subject sequence region). gaps_opened_before_query_start = [] for alist in gaps_opened_in_query: for item in alist: if item < qstarts[x]: if item not in gaps_opened_before_query_start: gaps_opened_before_query_start.append(item) number_of_gaps_opened_before_query_start = len(gaps_opened_before_query_start) # For this query/subject sequence region, determine its start position in a # multiple sequence alignment containing gaps. sstarts_in_msa.append(qstarts[x] + number_of_gaps_opened_before_query_start) # For each position in the query/subject sequence region, see whether the subject # has opened a gap in its query. If so, if it's already stored in gaps_opened_in_query, # only register that this subject is also responsible for it (by storing it in # the list of gaps_opened_in_query that corresponds to this subject). If it's not yet # stored in gaps_opened_in_query, do so and shift all gaps at positions greater than # this one up by one. number_of_gaps_opened_up_to_here = number_of_gaps_opened_before_query_start for y in range(len(qseqs[x])): pos_in_region = y pos_in_alignment = y + qstarts[x] + number_of_gaps_opened_up_to_here # See whether there is a gap already known at this position in the alignment. gap_known_at_this_position = False for alist in gaps_opened_in_query: if pos_in_alignment in alist: gap_known_at_this_position = True # See whether there is a gap found at this position in this query: gap_found_in_this_query_at_this_position = False if qseqs[x][y] == "-": gap_found_in_this_query_at_this_position = True # 1.) If there is a gap already registered at this position and it is also found in this # query, then memorize that this subject is also responsible for the observed # gap (don't change number_of_gaps_opened_up_to_here). if gap_known_at_this_position == True and gap_found_in_this_query_at_this_position == True: gaps_opened_in_query[x].append(pos_in_alignment) # 2.) If there is no gap known for this position, but there is one found in this query, # then memorize that this subject is responsible for the gap, and shift all gaps # at positions greater than this one up by one (don't change number_of_gaps_opened_up_to_here). elif gap_known_at_this_position == False and gap_found_in_this_query_at_this_position == True: gaps_opened_in_query[x].append(pos_in_alignment) for alist in gaps_opened_in_query: for item in alist: if item > pos_in_alignment: item += 1 # 3.) If there is a gap known for this position, but this query does not have one, # increase the number_of_gaps_opened_up_to_here by one. elif gap_known_at_this_position == True and gap_found_in_this_query_at_this_position == False: number_of_gaps_opened_up_to_here += 1 # 4.) If there is no gap known for this position, and this query also does not have one, # move on. else: # If all subjects are sequential, as they are with strictness > 0, it's much easier. gaps_opened_in_query = [] sstarts_in_msa = [] for x in range(len(qseqids)): number_of_gaps_already_opened = len(gaps_opened_in_query) sstarts_in_msa.append(qstarts[x] + number_of_gaps_already_opened) for y in range(len(qseqs[x])): if qseqs[x][y] == "-": gaps_opened_in_query.append(qstarts[x]+y+number_of_gaps_already_opened) # Prepare the aligned sequence for this subject. if strictness == 0: aligned_subject_seqs = [] for x in range(len(qseqids)): aligned_subject_seq = "" while len(aligned_subject_seq) < sstarts_in_msa[x]-1: aligned_subject_seq += "-" pos_in_region = 0 pos_in_alignment = sstarts_in_msa[x] while pos_in_region < len(sseqs[x]): # See whether any subject has opened a gap in the alignment at this position. gap_known_at_this_position = False for alist in gaps_opened_in_query: if pos_in_alignment in alist: gap_known_at_this_position = True # See whether this subject is responsible for the gap in the alignment at this position. this_subject_is_responsible = False if pos_in_alignment in gaps_opened_in_query[x]: this_subject_is_responsible = True # 1.) If there is a gap in the alignment at this position and this subject # is responsible for it, just add this position of this subject to the # aligned sequence and move on. if gap_known_at_this_position == True and this_subject_is_responsible == True: aligned_subject_seq += sseqs[x][pos_in_region] pos_in_region += 1 pos_in_alignment += 1 # 2.) If there is a gap in the alignment at this position and this subject is # not responsible for it, add a gap to the aligned sequence. elif gap_known_at_this_position == True and this_subject_is_responsible == False: aligned_subject_seq += "-" pos_in_alignment += 1 # 3.) If there is no gap in the alignment at this position, just add this position # of this subject to the aligned sequence and move on. elif gap_known_at_this_position == False: aligned_subject_seq += sseqs[x][pos_in_region] pos_in_region += 1 pos_in_alignment += 1 # See how many gaps have been opened in total. gaps_opened_in_query_flat = [] for alist in gaps_opened_in_query: for item in alist: if item not in gaps_opened_in_query_flat: gaps_opened_in_query_flat.append(item) total_number_of_opened_gaps = len(gaps_opened_in_query_flat) alignment_length = len(query_window_seqs[z]) + total_number_of_opened_gaps # Add missing data to the end of the alignment. while pos_in_alignment <= alignment_length: aligned_subject_seq += "-" pos_in_alignment += 1 # Store the aligned sequence for this subject's sequence. aligned_subject_seqs.append(aligned_subject_seq) else: aligned_subject_seq = "" for x in range(len(qseqids)): while len(aligned_subject_seq) < sstarts_in_msa[x]-1: aligned_subject_seq += "-" aligned_subject_seq += sseqs[x] while len(aligned_subject_seq) < len(query_window_seqs[z]) + len(gaps_opened_in_query): aligned_subject_seq += "-" # Combine the subject ids in a string to be included in the fasta id. if strictness > 0: if strictness == 1: combined_sseqids = "" for sseqid in sseqids: combined_sseqids += sseqid combined_sseqids += "|" combined_sseqids = combined_sseqids[:-1] elif strictness == 2: combined_sseqids == sseqid[0] else: print("ERROR: Unknown strictness setting: " + strictness + "!") sys.exit(1) # Calculate the combined bitscore to be included in the fasta id. if strictness > 0: combined_bitscore = 0 for bitscore in bitscores: combined_bitscore += bitscore # Store the results from this subject. # sseqids_per_subject, bitscores_per_subject, aligned_subject_seq_per_subject all represent # lists of lists when strictness is 0, and lists of values when strictness is 1 or 2. if strictness == 0: gaps_opened_in_query_flat = [] for alist in gaps_opened_in_query: for item in alist: if item not in gaps_opened_in_query_flat: gaps_opened_in_query_flat.append(item) gaps_opened_in_query_per_subject.append(gaps_opened_in_query_flat) else: gaps_opened_in_query_per_subject.append(gaps_opened_in_query) if strictness == 0: sseqids_per_subject.append(sseqids) else: sseqids_per_subject.append(combined_sseqids) if strictness == 0: bitscores_per_subject.append(bitscores) else: bitscores_per_subject.append(combined_bitscore) if strictness > 0: number_of_hits_per_subject.append(len(qseqids)) if strictness == 0: aligned_subject_seq_per_subject.append(aligned_subject_seqs) else: aligned_subject_seq_per_subject.append(aligned_subject_seq) # If tblastn was used, the nucleotide sequence has to be obtained from the subject # fasta file. if translate: print(" done.") print("Reading subject " + subject + " to retrieve nucleotide sequence...", end="") sys.stdout.flush() if strictness == 0: nucl_subject_seqs = [] else: nucl_subject_seq = "" for x in range(len(qseqids)): with open(subject) as f: subject_nucl_seq = "" in_seq = False for line in f: if line[0:len(sseqids[x])+1] == ">" + sseqids[x]: if line[len(sseqids[x])+1] == " " or line[len(sseqids[x])+1] == "\n": in_seq = True elif in_seq == True and line.strip() != "": if line[0] == ">": break else: subject_nucl_seq += line.strip() if subject_nucl_seq == "": print("ERROR: The nucleotide sequence " + sseqids[x] + " could not be found in subject " + subject + "!") sys.exit(1) if sstarts[x] < sends[x]: nucl_subject_seq_for_this_hit = subject_nucl_seq[sstarts[x]-1:sends[x]] else: nucl_subject_seq_for_this_hit = subject_nucl_seq[sends[x]-1:sstarts[x]] # Make reverse complement. tmp = "" pos = len(nucl_subject_seq_for_this_hit) while pos > 0: pos -= 1 if nucl_subject_seq_for_this_hit[pos].upper() == "A": tmp += "T" elif nucl_subject_seq_for_this_hit[pos].upper() == "C": tmp += "G" elif nucl_subject_seq_for_this_hit[pos].upper() == "G": tmp += "C" elif nucl_subject_seq_for_this_hit[pos].upper() == "T": tmp += "A" elif nucl_subject_seq_for_this_hit[pos].strip() != "": tmp += "N" nucl_subject_seq_for_this_hit = tmp if strictness == 0: nucl_subject_seqs.append(nucl_subject_seq_for_this_hit) else: nucl_subject_seq += nucl_subject_seq_for_this_hit if strictness == 0: nucl_subject_seq_per_subject.append(nucl_subject_seqs) else: nucl_subject_seq_per_subject.append(nucl_subject_seq) print(" done.") # Produce the alignment with the query and all subject sequences. print("Producing alignment...", end="") reached_the_end = False pos = 0 aligned_ref_seq = query_window_seqs[z] while reached_the_end == False: # Increment the current position. pos += 1 # Test whether any (and if so, which) of the subject has opened a gap at this position. subjects_responsible_for_gap_opening_at_this_pos = [] for subject_count in range(len(subjects)): if pos in gaps_opened_in_query_per_subject[subject_count]: subjects_responsible_for_gap_opening_at_this_pos.append(subject_count) # If there is indeed a gap opening at this position. if len(subjects_responsible_for_gap_opening_at_this_pos) > 0: # Insert a gap into the query sequence. tmp_aligned_ref_seq = aligned_ref_seq[:(pos-1)] + "-" + aligned_ref_seq[(pos-1):] aligned_ref_seq = tmp_aligned_ref_seq # Insert a gap into all subject sequences unless they are responsible for the gap. for subject_count in range(len(subjects)): if subject_count not in subjects_responsible_for_gap_opening_at_this_pos: if strictness == 0: for sseq_count in range(len(aligned_subject_seq_per_subject[subject_count])): aligned_subject_seq in aligned_subject_seq_per_subject[subject_count][sseq_count] tmp_aligned_subject_seq = aligned_subject_seq[:(pos-1)] + "-" + aligned_subject_seq[(pos-1):] aligned_subject_seq_per_subject[subject_count][sseq_count] = tmp_aligned_subject_seq # Increase all gap positions greater than the current pos by 1. tmp_gaps_opened_in_query = [] for gap_pos in gaps_opened_in_query_per_subject[subject_count]: if gap_pos > pos: tmp_gaps_opened_in_query.append(gap_pos+1) else: tmp_gaps_opened_in_query.append(gap_pos) gaps_opened_in_query_per_subject[subject_count] = tmp_gaps_opened_in_query else: aligned_subject_seq = aligned_subject_seq_per_subject[subject_count] tmp_aligned_subject_seq = aligned_subject_seq[:(pos-1)] + "-" + aligned_subject_seq[(pos-1):] aligned_subject_seq_per_subject[subject_count] = tmp_aligned_subject_seq # Increase all gap positions greater than the current pos by 1. tmp_gaps_opened_in_query = [] for gap_pos in gaps_opened_in_query_per_subject[subject_count]: if gap_pos > pos: tmp_gaps_opened_in_query.append(gap_pos+1) else: tmp_gaps_opened_in_query.append(gap_pos) gaps_opened_in_query_per_subject[subject_count] = tmp_gaps_opened_in_query # Stop when the current position has reached the length of the aligned query seq. if pos >= len(aligned_ref_seq): reached_the_end = True # Collect the alignment entries. aln_ids = [] aln_seqs = [] aln_ids.append(query_window_ids[z]) aln_seqs.append(aligned_ref_seq) if strictness == 0: for subject_count in range(len(subjects)): for sseqids_count in range(len(sseqids_per_subject[subject_count])): aln_id = subjects[subject_count].split("/")[-1].replace(".fasta","").replace(".ctg","").replace(".utg","") aln_id += "[&sseqid=" + str(sseqids_per_subject[subject_count][sseqids_count]) + "," aln_id += "bitscore=" + str(bitscores_per_subject[subject_count][sseqids_count]) aln_id += "]" aln_ids.append(aln_id) aln_seqs.append(aligned_subject_seq_per_subject[subject_count][sseqids_count]) else: for subject_count in range(len(subjects)): aln_id = subjects[subject_count].split("/")[-1].replace(".fasta","").replace(".ctg","").replace(".utg","") aln_id += "[&sseqid=" + str(sseqids_per_subject[subject_count]) + "," aln_id += "bitscore=" + str(bitscores_per_subject[subject_count]) + "," aln_id += "nhits=" + str(number_of_hits_per_subject[subject_count]) aln_id += "]" aln_ids.append(aln_id) aln_seqs.append(aligned_subject_seq_per_subject[subject_count]) print(" done.") if refine: # User mafft to refine the alignment. print("Refining the alignment with mafft...", end="") tmp_aln_string = "" for aln_count in range(len(aln_ids)): tmp_aln_string += ">" + aln_ids[aln_count].replace(" ","_") tmp_aln_string += "\n" tmp_aln_string += aln_seqs[aln_count] tmp_aln_string += "\n" tmp_alignment_file = tempfile.NamedTemporaryFile(delete=False) tmp_alignment_file.write(tmp_aln_string.encode('utf-8')) tmp_alignment_file.close() mafft_ary = ["mafft"] mafft_ary.append("--auto") mafft_ary.append("--quiet") mafft_ary.append(tmp_alignment_file.name) mafft_out = subprocess.check_output(mafft_ary).decode("utf-8").strip() refined_aln_ids = [] refined_aln_seqs = [] mafft_lines = mafft_out.split("\n") for mafft_line in mafft_lines: if mafft_line[0] == ">": refined_aln_ids.append(mafft_line.strip()[1:]) refined_aln_seqs.append("") elif mafft_line.strip() != "": refined_aln_seqs[-1] += mafft_line.strip() # Make sure that the alignment ids haven't changed. if aln_ids != refined_aln_ids: print("ERROR: Alignment ids have changed after mafft refinement! (before: " + aln_ids[0] + ", " + aln_ids[1] + ", " + aln_ids[2] + "..." + "; after:" + refined_aln_ids[0] + ", " + refined_aln_ids[1] + ", " + refined_aln_ids[2] + "...)") sys.exit(1) if len(aln_seqs) != len(refined_aln_seqs): print("ERROR: The number of sequences has changed after mafft refinement!") for x in range(len(aln_seqs)): if aln_seqs[x].replace("-","") != refined_aln_seqs[x].replace("-",""): print("ERROR: The composition of sequence " + aln_ids[x] + " has changed after mafft refinement!") print("Before mafft refinement:") print(aln_seqs[x]) print() print("After mafft refinement:") print(refined_aln_seqs[x]) sys.exit(1) aln_ids = refined_aln_ids aln_seqs = refined_aln_seqs print(" done.") # Check whether the minimum completeness is met by this alignment. write_this_alignment = True if minimum_completeness != None: print("Checking whether alignment completeness is sufficient...", end="") number_of_complete_sites = 0 for pos in range(len(aln_seqs[0])): # Check whether there are gaps at this position. gaps_at_this_pos = False for aln_seq in aln_seqs: if aln_seq[pos] == "-": gaps_at_this_pos = True break # If there are no gaps at this position, increase the counter. if gaps_at_this_pos == False: number_of_complete_sites += 1 if number_of_complete_sites < minimum_completeness: write_this_alignment = False print(" done.") # If tblastn was used, use the amino acid alignment and the obtained nucleotide sequence # to produce aligned nucleotide sequences. Leave out the first sequence, which is the # amino acid query. if translate and write_this_alignment: print("Producing nucleotide alignment based on amino acid alignment...", end="") untranslated_aln_ids = aln_ids[1:] untranslated_aln_seqs = [] aln_index = 0 for x in range(len(nucl_subject_seq_per_subject)): if strictness == 0: for y in range(len(nucl_subject_seq_per_subject[x])): aln_index += 1 aligned_nucl_subject_seq_per_subject = "" # Make sure that the nucleotide sequence contains three times the number of sites # of the amino acid sequence. if len(nucl_subject_seq_per_subject[x][y]) != 3 * len(aln_seqs[aln_index].replace("-","")): print("ERROR: The length of the amino acid and nucleotide sequcences don't match!") print("ERROR: " + aln_ids[aln_index]) print("ERROR: " + nucl_subject_seq_per_subject[x][y]) print("ERROR: " + aln_seqs[aln_index]) sys.exit(1) nucl_pos = 0 for aa_pos in range(len(aln_seqs[aln_index])): if aln_seqs[aln_index][aa_pos] == "-": aligned_nucl_subject_seq_per_subject += "---" else: aligned_nucl_subject_seq_per_subject += nucl_subject_seq_per_subject[x][y][nucl_pos:nucl_pos+3] nucl_pos += 3 untranslated_aln_seqs.append(aligned_nucl_subject_seq_per_subject) else: aln_index += 1 aligned_nucl_subject_seq_per_subject = "" # Make sure that the nucleotide sequence contains three times the number of sites # of the amino acid sequence. if len(nucl_subject_seq_per_subject[x]) != 3 * len(aln_seqs[aln_index].replace("-","")): print("ERROR: The length of the amino acid and nucleotide sequcences don't match!") print("ERROR: " + aln_ids[aln_index]) print("ERROR: " + nucl_subject_seq_per_subject[x]) print("ERROR: " + aln_seqs[aln_index]) sys.exit(1) nucl_pos = 0 for aa_pos in range(len(aln_seqs[x+1])): if aln_seqs[x+1][aa_pos] == "-": aligned_nucl_subject_seq_per_subject += "---" else: aligned_nucl_subject_seq_per_subject += nucl_subject_seq_per_subject[x][nucl_pos:nucl_pos+3] nucl_pos += 3 untranslated_aln_seqs.append(aligned_nucl_subject_seq_per_subject) # Remove gap-only sites. if len(untranslated_aln_seqs) > 0: tmp_untranslated_aln_seqs = [] for untranslated_aln_seq in untranslated_aln_seqs: tmp_untranslated_aln_seqs.append("") for x in range(len(untranslated_aln_seqs[0])): gap_only_site = True for untranslated_aln_seq in untranslated_aln_seqs: if untranslated_aln_seq[x] != "-": gap_only_site = False break if gap_only_site == False: for y in range(len(untranslated_aln_seqs)): tmp_untranslated_aln_seqs[y] += untranslated_aln_seqs[y][x] untranslated_aln_seqs = tmp_untranslated_aln_seqs print(" done.") # If the alignment is ok, prepare it. aln_string = "" if translate: untranslated_aln_string = "" if write_this_alignment == True: # Prepare the alignment string. if output_format == "phylip": aln_string += str(len(aln_ids)) + " " + str(len(aln_seqs[0])) + "\n" longest_id_length = 0 for aln_id in aln_ids: if len(aln_id) > longest_id_length: longest_id_length = len(aln_id) for aln_count in range(len(aln_ids)): aln_string += aln_ids[aln_count].replace(" ","_").ljust(longest_id_length+2) aln_string += aln_seqs[aln_count] aln_string += "\n" # If tblastn was used, also prepare the nucleotide sequence alignment, which has one # record less than the amino acid sequene alignment, as the amino acid query is not # included. if translate: if len(untranslated_aln_seqs) > 0: untranslated_aln_string += str(len(untranslated_aln_ids)) + " " + str(len(untranslated_aln_seqs[0])) + "\n" for untranslated_aln_count in range(len(untranslated_aln_ids)): untranslated_aln_string += untranslated_aln_ids[untranslated_aln_count].replace(" ","_").ljust(longest_id_length+2) untranslated_aln_string += untranslated_aln_seqs[untranslated_aln_count] untranslated_aln_string += "\n" else: for aln_count in range(len(aln_ids)): aln_string += ">" + aln_ids[aln_count].replace(" ","_") aln_string += "\n" aln_string += aln_seqs[aln_count] aln_string += "\n" # If tblastn was used, also prepare the nucleotide sequence alignment, which has one # record less than the amino acid sequene alignment, as the amino acid query is not # included. if translate: if len(untranslated_aln_seqs) > 0: for untranslated_aln_count in range(len(untranslated_aln_ids)): untranslated_aln_string += ">" + untranslated_aln_ids[untranslated_aln_count].replace(" ","_") untranslated_aln_string += "\n" untranslated_aln_string += untranslated_aln_seqs[untranslated_aln_count] untranslated_aln_string += "\n" # Write the alignment to file. if write_this_alignment or write_empty_files: aln_file = open(aln_file_name, 'w') aln_file.write(aln_string) aln_file.close() # If tblastn was used, also write the nucleotide sequence alignment file. if translate: untranslated_aln_file_name = aln_file_name.replace(".fasta","_nucl.fasta") untranslated_aln_file = open(untranslated_aln_file_name, 'w') untranslated_aln_file.write(untranslated_aln_string) untranslated_aln_file.close() # Feedback. if write_this_alignment: print("Wrote file " + aln_file_name) if translate: print("Wrote file " + untranslated_aln_file_name) sys.stdout.flush() else: print("Insufficient completeness (" + str(number_of_complete_sites) + " sites) for " + query_window_ids[z]) sys.stdout.flush() if write_empty_files: print("Wrote empty file " + aln_file_name) if translate: print("Wrote empty file " + untranslated_aln_file_name) sys.stdout.flush()