#!/usr/bin/env python
# Convert GFFv3 to Sequin TBL
# Usage: gff3-to-tbl project.gff product.tsv protein.fa >project.tbl
# Written by Shaun Jackman <sjackman@gmail.com>.

from BCBio import GFF
from Bio import SeqIO
import argparse
import csv
import re
import sys

genbank_regex = re.compile(r"(?:INSD:|_prot_)([A-Z]+[0-9]+\.[0-9])")
refseq_regex = re.compile(r"(?:RefSeq:|_prot_)([A-Z]+_[0-9]+\.[0-9])")
uniprot_regex = re.compile(r"UniProtKB:([A-Z0-9]+)")

def print_coord(f, incomplete):
	""" Print the coordinates of this feature.
	"""
	coord = (f.location.start + 1, f.location.end) if f.strand == 1 else (f.location.end, f.location.start + 1)
	flag = ("<" if incomplete[0] else "", ">" if incomplete[1] else "")
	print "%s%d\t%s%d" % (flag[0], coord[0], flag[1], coord[1])

def print_coord_override_type(f, type, incomplete):
	""" Print the coordinates of this feature and a type.
	"""
	coord = (f.location.start + 1, f.location.end) if f.strand == 1 else (f.location.end, f.location.start + 1)
	flag = ("<" if incomplete[0] else "", ">" if incomplete[1] else "")
	print "%s%d\t%s%d\t%s" % (flag[0], coord[0], flag[1], coord[1], type)

def print_coord_type(f, incomplete):
	""" Print the coordinates and type of this feature.
	"""
	print_coord_override_type(f, f.type, incomplete)

def subfeatures(f):
	""" Iterate over the subfeatures of this feature.
	"""
	if f.strand == 1:
		return f.sub_features
	else:
		return reversed(f.sub_features)

def print_mrna(f, incomplete, locus_tag, product):
	""" Print a mRNA feature.
	"""
	if len(f.sub_features) == 0:
		return
	cds = [sf for sf in subfeatures(f) if sf.type == "CDS"]
	for i, sf in enumerate(cds):
		if len(cds) == 1:
			# Single exon
			print_coord_type(sf, incomplete)
		elif i == 0:
			# First exon
			print_coord_type(sf, (incomplete[0], False))
		elif i == len(cds) - 1:
			# Last exon
			print_coord(sf, (False, incomplete[1]))
		else:
			# Middle exon
			print_coord(sf, (False, False))
	name = f.qualifiers["Name"][0]
	phase = int(cds[0].qualifiers["phase"][0])
	if phase != 0:
		print "\t\t\tcodon_start\t%d" % (1 + phase)
	print "\t\t\tproduct\t%s" % product
	print "\t\t\tprotein_id\tgnl|%s|%s" % (args.centre, locus_tag)
	if "part" in f.qualifiers:
		print "\t\t\tnote\tpart=%s" % f.qualifiers["part"][0]

	# Check whether inference or Name contains evidence of a similar protein.
	inference = f.qualifiers["inference"][-1] if "inference" in f.qualifiers else name
	genbank_match = genbank_regex.search(inference)
	refseq_match = refseq_regex.search(inference)
	uniprot_match = uniprot_regex.search(inference)
	db = "INSD" if genbank_match else "RefSeq" if refseq_match else "UniProtKB" if uniprot_match else None
	match = genbank_match or refseq_match or uniprot_match
	if match:
		print "\t\t\tinference\tsimilar to AA sequence:%s:%s" % (db, match.group(1))

	for tag in "exception", "note", "transl_except":
		if tag in f.qualifiers:
			for value in f.qualifiers[tag]:
				print "\t\t\t%s\t%s" % (tag, value)

def print_trna(f, locus_tag, product):
	""" Print a tRNA gene.
	"""
	incomplete = (False, False)
	exons = [sf for sf in subfeatures(f) if sf.type == "exon"]
	for i, sf in enumerate(exons):
		if i == 0:
			print_coord_override_type(sf, "tRNA", incomplete)
		else:
			print_coord(sf, incomplete)
	name = f.qualifiers['Name'][0]
	anticodon = name.partition('-')[2]
	print "\t\t\tproduct\t%s" % product
	print "\t\t\tnote\tanticodon:%s" % anticodon

	for tag in "inference", "exception", "note":
		if tag in f.qualifiers:
			for value in f.qualifiers[tag]:
				print "\t\t\t%s\t%s" % (tag, value)

def print_introns(f):
	incomplete = (False, False)
	for sf in subfeatures(f):
		if sf.type == 'intron':
			print_coord_type(sf, incomplete)

def trans_spliced(f):
	""" Return whether this feature is trans-spliced.
	"""
	return "exception" in f.qualifiers and "trans-splicing" in f.qualifiers["exception"]

def incomplete_cds(f, translations):
	""" Return whether this coding feature is incomplete.
	"""
	for sf in subfeatures(f):
		if sf.type == 'mRNA':
			if not 'ID' in sf.qualifiers:
				continue
			name = sf.qualifiers["Name"][0]
			id = sf.qualifiers["ID"][0]
			seq = translations[id]
			# ACG codes for T and is edited to AUG, which codes for M.
			acg_start = seq[0] == "T" \
				and "exception" in sf.qualifiers and "note" in sf.qualifiers \
				and "RNA editing" in sf.qualifiers["exception"] \
				and "putative start codon created by RNA editing" in sf.qualifiers["note"]
			# GCG codes for A and is edited to GUG, which codes for V, an alternative start codon.
			gcg_start = seq[0] == "A" \
				and "exception" in sf.qualifiers and "note" in sf.qualifiers \
				and "RNA editing" in sf.qualifiers["exception"] \
				and "putative alternative GUG start codon created by RNA editing" in sf.qualifiers["note"]
			# matR starts with a GGG codon, which codes for G.
			ggg_start = seq[0] == "G" and name == "matR"
			# rpl16 starts with a GUG codon, which codes for V.
			gtg_start = seq[0] == "V" and name == "rpl16"
			# CAG codes for Q and is edited to UAG, which is a stop codon.
			cag_stop = seq[-1] == "Q" \
				and "exception" in sf.qualifiers and "note" in sf.qualifiers \
				and "RNA editing" in sf.qualifiers["exception"] \
				and "putative stop codon created by RNA editing" in sf.qualifiers["note"]
			good_start = seq[0] == "M" or acg_start or gcg_start or ggg_start or gtg_start
			good_stop = seq[-1] == "*" or cag_stop
			return (not good_start, not good_stop)
	return (False, False)

def print_gene(f, gene_product_table, translations):
	""" Print a gene feature.
	"""
	incomplete = incomplete_cds(f, translations)
	if trans_spliced(f):
		# Print the coordinates of the trans-spliced mRNA.
		count = 0
		for sf in subfeatures(f):
			count += 1
			if count == 1:
				print_coord_override_type(sf, 'gene', incomplete)
			else:
				print_coord(sf, incomplete)
	else:
		print_coord_type(f, incomplete)

	name = re.sub("-gene$", "", f.qualifiers['Name'][0])
	gene = re.sub("[-_][0-9]+$", "", re.sub(r"\|.*$", "", name))
	locus_tag = "%s_%s" % (args.locustag, f.qualifiers['ID'][0])

	if gene.startswith("orf"):
		product = "hypothetical protein"
	elif gene in gene_product_table:
		product = gene_product_table[gene]
	elif gene.startswith("trn") and gene[0:4] in gene_product_table:
		product = gene_product_table[gene[0:4]]
	elif gene.startswith("ymf") and gene[3:] in gene_product_table:
		product = "putative " + gene_product_table[gene[3:]]
	else:
		print >>sys.stderr, "Error: gene", gene, "not found in", args.gene_product_file
		sys.exit(1)

	print "\t\t\tgene\t%s" % gene
	print "\t\t\tlocus_tag\t%s" % locus_tag
	if 'exception' in f.qualifiers:
		print "\t\t\texception\t%s" % f.qualifiers['exception'][0]
	if 'pseudo' in f.qualifiers:
		print "\t\t\tpseudo"

	for sf in subfeatures(f):
		if sf.type == 'mRNA':
			print_mrna(sf, incomplete, locus_tag, product)
			print_introns(sf)
		elif sf.type == 'rRNA':
			print_coord_type(sf, incomplete)
			size = name.partition('rrn')[2]
			print "\t\t\tproduct\t%s" % product
			print_introns(sf)
		elif sf.type == 'tRNA':
			print_trna(sf, locus_tag, product)
			print_introns(sf)

def main(args):
	gene_product_table = {}
	with open(args.gene_product_file) as tsvfile:
		reader = csv.reader(tsvfile, dialect = "excel-tab")
		header = reader.next()
		assert header == ["gene", "product"]
		for row in reader:
			gene_product_table[row[0]] = row[1]

	translations = {}
	fasta_sequences = SeqIO.parse(open(args.translation_file), 'fasta')
	for record in fasta_sequences:
		name, sequence = record.id, record.seq.tostring()
		translations[name] = sequence

	for rec in GFF.parse(args.gff_file):
		print ">Feature %s" % rec.id
		seen = {}
		for f in rec.features:
			if f.type != 'gene':
				continue
			if trans_spliced(f):
				# Skip the second copy of the trans-spliced gene.
				child = f.sub_features[0].qualifiers['ID'][0]
				if child in seen:
					continue
				seen[child] = True
			print_gene(f, gene_product_table, translations)

parser = argparse.ArgumentParser(description="Convert GFFv3 to Sequin TBL")
parser.add_argument("--centre", help="Sequencing centre ID")
parser.add_argument("--locustag", help="Locus tag prefix")
parser.add_argument("gff_file", metavar="project.gff", help="GFF file of annotations")
parser.add_argument("gene_product_file", metavar="product.tsv", help="TSV of gene products")
parser.add_argument("translation_file", metavar="protein.fa", help="FASTA of protein sequences")
args = parser.parse_args()

if __name__ == "__main__":
	main(args)