import argparse import gzip import os import time from string import strip import sys def name_search(input_tab, name): acc_list = [] with gzip.open(input_tab) as tab: for line in tab: s = map(strip, line.split("\t")) path = map(strip, s[1].split(";")) if name in path: acc_list.append(s[0]) return acc_list def fasta2dict(fasta_seq): acc2seq = {} acc = "" with gzip.open(fasta_seq) as seq_file: for line in seq_file: if line.startswith(">"): acc = line.split()[0][1:] acc2seq[acc] = [line.strip(), ""] else: acc2seq[acc][1] += line.strip() return acc2seq def fasta_parsing(fasta_file, acc_list, out_file): if out_file is None: folder = time.strftime("%b_%d_%Y_%H_%M_%S", time.localtime()) while os.path.exists(folder): folder = time.strftime("%b_%d_%Y_%H_%M_%S", time.localtime()) else: os.mkdir(folder) out_file = os.path.join(folder, "input_seq.fa") tmp = open(os.path.join(folder, "input_seq.fa"), "w") else: tmp = open(out_file, "w") seq_diz = fasta2dict(fasta_file) for accession in acc_list: tmp.write("%s\n" % seq_diz[accession][0]) tmp.write("%s\n" % seq_diz[accession][1]) tmp.close() return out_file def clustering(fasta_in): import subprocess, shlex, shutil cmd = shlex.split("vsearch --cluster_fast %s --id 0.97 --centroids tmp" % fasta_in) p = subprocess.Popen(cmd) p.wait() if os.path.exists("tmp") and os.stat("tmp")[6] != 0: shutil.move("tmp",fasta_in) return fasta_in if __name__ == "__main__": parser = argparse.ArgumentParser( description="This script extract the input ITS1 sequences according to a taxonomic name.", prefix_chars="-", usage="prepare_input_file2primer_inference.py -t 'Aspergillus Flavus' -f fasta_file -p taxonomy_file") parser.add_argument("-t", "--taxon_name", help="Taxon name [MANDATORY]", action="store", required=True) parser.add_argument("-p", "--tax_path", help="CSV file containing the association between the accession number and the taxonomic path " "[MANDATORY]", action="store", required=True) parser.add_argument("-f", "--fasta", help="FASTA file containing the ITS1 sequences and the flanking regions [MANDATORY]", action="store", required=True) parser.add_argument("-o", "--output_file", help="complete path and name of the output file", action="store", required=False, default=None) parser.add_argument("-c","--cluster",action='store_false', help= "If the number of selected ITS1 sequences is more than 1000, the sequences are clusterd at 97% of identity", default=True) args = parser.parse_args() taxa = args.taxon_name output = args.output_file # print str(taxa) fasta = args.fasta path_list = args.tax_path cluster = args.cluster lista_accession = name_search(path_list, taxa) if cluster and lista_accession >= 1000: print clustering(fasta_parsing(fasta, lista_accession, output)) else: print fasta_parsing(fasta, lista_accession, output) if os.stat(output).st_size == 0: sys.stderr.write('No output. The file is empty.') else: sys.stdout.write('DONE')