#!/usr/bin/env python # Convert GFFv3 to Sequin TBL # Usage: gff3-to-tbl project.gff product.tsv protein.fa >project.tbl # Written by Shaun Jackman . from BCBio import GFF from Bio import SeqIO import argparse import csv import re import sys genbank_regex = re.compile(r"(?:INSD:|_prot_)([A-Z]+[0-9]+\.[0-9])") refseq_regex = re.compile(r"(?:RefSeq:|_prot_)([A-Z]+_[0-9]+\.[0-9])") uniprot_regex = re.compile(r"UniProtKB:([A-Z0-9]+)") def print_coord(f, incomplete): """ Print the coordinates of this feature. """ coord = (f.location.start + 1, f.location.end) if f.strand == 1 else (f.location.end, f.location.start + 1) flag = ("<" if incomplete[0] else "", ">" if incomplete[1] else "") print "%s%d\t%s%d" % (flag[0], coord[0], flag[1], coord[1]) def print_coord_override_type(f, type, incomplete): """ Print the coordinates of this feature and a type. """ coord = (f.location.start + 1, f.location.end) if f.strand == 1 else (f.location.end, f.location.start + 1) flag = ("<" if incomplete[0] else "", ">" if incomplete[1] else "") print "%s%d\t%s%d\t%s" % (flag[0], coord[0], flag[1], coord[1], type) def print_coord_type(f, incomplete): """ Print the coordinates and type of this feature. """ print_coord_override_type(f, f.type, incomplete) def subfeatures(f): """ Iterate over the subfeatures of this feature. """ if f.strand == 1: return f.sub_features else: return reversed(f.sub_features) def print_mrna(f, incomplete, locus_tag, product): """ Print a mRNA feature. """ if len(f.sub_features) == 0: return cds = [sf for sf in subfeatures(f) if sf.type == "CDS"] for i, sf in enumerate(cds): if len(cds) == 1: # Single exon print_coord_type(sf, incomplete) elif i == 0: # First exon print_coord_type(sf, (incomplete[0], False)) elif i == len(cds) - 1: # Last exon print_coord(sf, (False, incomplete[1])) else: # Middle exon print_coord(sf, (False, False)) name = f.qualifiers["Name"][0] phase = int(cds[0].qualifiers["phase"][0]) if phase != 0: print "\t\t\tcodon_start\t%d" % (1 + phase) print "\t\t\tproduct\t%s" % product print "\t\t\tprotein_id\tgnl|%s|%s" % (args.centre, locus_tag) if "part" in f.qualifiers: print "\t\t\tnote\tpart=%s" % f.qualifiers["part"][0] # Check whether inference or Name contains evidence of a similar protein. inference = f.qualifiers["inference"][-1] if "inference" in f.qualifiers else name genbank_match = genbank_regex.search(inference) refseq_match = refseq_regex.search(inference) uniprot_match = uniprot_regex.search(inference) db = "INSD" if genbank_match else "RefSeq" if refseq_match else "UniProtKB" if uniprot_match else None match = genbank_match or refseq_match or uniprot_match if match: print "\t\t\tinference\tsimilar to AA sequence:%s:%s" % (db, match.group(1)) for tag in "exception", "note", "transl_except": if tag in f.qualifiers: for value in f.qualifiers[tag]: print "\t\t\t%s\t%s" % (tag, value) def print_trna(f, locus_tag, product): """ Print a tRNA gene. """ incomplete = (False, False) exons = [sf for sf in subfeatures(f) if sf.type == "exon"] for i, sf in enumerate(exons): if i == 0: print_coord_override_type(sf, "tRNA", incomplete) else: print_coord(sf, incomplete) name = f.qualifiers['Name'][0] anticodon = name.partition('-')[2] print "\t\t\tproduct\t%s" % product print "\t\t\tnote\tanticodon:%s" % anticodon for tag in "inference", "exception", "note": if tag in f.qualifiers: for value in f.qualifiers[tag]: print "\t\t\t%s\t%s" % (tag, value) def print_introns(f): incomplete = (False, False) for sf in subfeatures(f): if sf.type == 'intron': print_coord_type(sf, incomplete) def trans_spliced(f): """ Return whether this feature is trans-spliced. """ return "exception" in f.qualifiers and "trans-splicing" in f.qualifiers["exception"] def incomplete_cds(f, translations): """ Return whether this coding feature is incomplete. """ for sf in subfeatures(f): if sf.type == 'mRNA': if not 'ID' in sf.qualifiers: continue name = sf.qualifiers["Name"][0] id = sf.qualifiers["ID"][0] seq = translations[id] # ACG codes for T and is edited to AUG, which codes for M. acg_start = seq[0] == "T" \ and "exception" in sf.qualifiers and "note" in sf.qualifiers \ and "RNA editing" in sf.qualifiers["exception"] \ and "putative start codon created by RNA editing" in sf.qualifiers["note"] # GCG codes for A and is edited to GUG, which codes for V, an alternative start codon. gcg_start = seq[0] == "A" \ and "exception" in sf.qualifiers and "note" in sf.qualifiers \ and "RNA editing" in sf.qualifiers["exception"] \ and "putative alternative GUG start codon created by RNA editing" in sf.qualifiers["note"] # matR starts with a GGG codon, which codes for G. ggg_start = seq[0] == "G" and name == "matR" # rpl16 starts with a GUG codon, which codes for V. gtg_start = seq[0] == "V" and name == "rpl16" # CAG codes for Q and is edited to UAG, which is a stop codon. cag_stop = seq[-1] == "Q" \ and "exception" in sf.qualifiers and "note" in sf.qualifiers \ and "RNA editing" in sf.qualifiers["exception"] \ and "putative stop codon created by RNA editing" in sf.qualifiers["note"] good_start = seq[0] == "M" or acg_start or gcg_start or ggg_start or gtg_start good_stop = seq[-1] == "*" or cag_stop return (not good_start, not good_stop) return (False, False) def print_gene(f, gene_product_table, translations): """ Print a gene feature. """ incomplete = incomplete_cds(f, translations) if trans_spliced(f): # Print the coordinates of the trans-spliced mRNA. count = 0 for sf in subfeatures(f): count += 1 if count == 1: print_coord_override_type(sf, 'gene', incomplete) else: print_coord(sf, incomplete) else: print_coord_type(f, incomplete) name = re.sub("-gene$", "", f.qualifiers['Name'][0]) gene = re.sub("[-_][0-9]+$", "", re.sub(r"\|.*$", "", name)) locus_tag = "%s_%s" % (args.locustag, f.qualifiers['ID'][0]) if gene.startswith("orf"): product = "hypothetical protein" elif gene in gene_product_table: product = gene_product_table[gene] elif gene.startswith("trn") and gene[0:4] in gene_product_table: product = gene_product_table[gene[0:4]] elif gene.startswith("ymf") and gene[3:] in gene_product_table: product = "putative " + gene_product_table[gene[3:]] else: print >>sys.stderr, "Error: gene", gene, "not found in", args.gene_product_file sys.exit(1) print "\t\t\tgene\t%s" % gene print "\t\t\tlocus_tag\t%s" % locus_tag if 'exception' in f.qualifiers: print "\t\t\texception\t%s" % f.qualifiers['exception'][0] if 'pseudo' in f.qualifiers: print "\t\t\tpseudo" for sf in subfeatures(f): if sf.type == 'mRNA': print_mrna(sf, incomplete, locus_tag, product) print_introns(sf) elif sf.type == 'rRNA': print_coord_type(sf, incomplete) size = name.partition('rrn')[2] print "\t\t\tproduct\t%s" % product print_introns(sf) elif sf.type == 'tRNA': print_trna(sf, locus_tag, product) print_introns(sf) def main(args): gene_product_table = {} with open(args.gene_product_file) as tsvfile: reader = csv.reader(tsvfile, dialect = "excel-tab") header = reader.next() assert header == ["gene", "product"] for row in reader: gene_product_table[row[0]] = row[1] translations = {} fasta_sequences = SeqIO.parse(open(args.translation_file), 'fasta') for record in fasta_sequences: name, sequence = record.id, record.seq.tostring() translations[name] = sequence for rec in GFF.parse(args.gff_file): print ">Feature %s" % rec.id seen = {} for f in rec.features: if f.type != 'gene': continue if trans_spliced(f): # Skip the second copy of the trans-spliced gene. child = f.sub_features[0].qualifiers['ID'][0] if child in seen: continue seen[child] = True print_gene(f, gene_product_table, translations) parser = argparse.ArgumentParser(description="Convert GFFv3 to Sequin TBL") parser.add_argument("--centre", help="Sequencing centre ID") parser.add_argument("--locustag", help="Locus tag prefix") parser.add_argument("gff_file", metavar="project.gff", help="GFF file of annotations") parser.add_argument("gene_product_file", metavar="product.tsv", help="TSV of gene products") parser.add_argument("translation_file", metavar="protein.fa", help="FASTA of protein sequences") args = parser.parse_args() if __name__ == "__main__": main(args)