#!/usr/bin/env python3

#This tool was developed for the purposes of the Vertebrate
#Genomes Project. Redistribution and use in source and binary forms,
#with or without modification, are permitted provided that the
#following conditions are met:
#
#1. Redistributions of source code must retain the above copyright
#notice, this list of conditions and the following disclaimer.
#
#2. Redistributions in binary form must reproduce the above copyright
#notice, this list of conditions and the following disclaimer in the
#documentation and/or other materials provided with the distribution.
#
#3. Neither the name of the copyright holder nor the names of its
#contributors may be used to endorse or promote products derived from
#this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import csv
from pickle import TRUE
import sys
import pandas as pd
import argparse

csv.field_size_limit(sys.maxsize)


## Arguments/inputs ----

parser = argparse.ArgumentParser(add_help=TRUE)
parser.add_argument("--blastout", help="output from submit_mito_blast2.sh; format 6 csv table generated by blasting the assembly against the mitochondrial database")
args = parser.parse_args()


## Functions ----

def readfile(infile, dfname):
    # Open tabular format output from blast of scaffolds against NCBI mitochondrial db
    tabfile = []
    with open(infile,'r') as file:
        file = csv.reader(file, delimiter = '\t')
        for line in file: 
            tabfile.append(line)

    ## put in some sort of ... if sum alignment lengths < total scaff length, do not bother checking for coverage 
    dfname = pd.DataFrame(tabfile, columns = ['qseqid', 'sseqid', 'length', 'qstart', 'qend', 'evalue' , 'qlen', 'qcovs','qcovhsp'])
    return (dfname)

def calccov(df):
    highCovReportLines = []
    highCovScaffs = set()

    ## list of all the unique scaffolds named in the blast output 
    uniqScaffs = df.qseqid.unique()

    for uniqScaff in uniqScaffs:

        rows = df.loc[df['qseqid'] == uniqScaff] ## isolate rows with a particular scaffold name 

        uniqAccs = rows.sseqid.unique()

        totalScaffLength = (list(rows['qlen']))[0]

        for uniqAcc in uniqAccs:

            rowsAcc = rows.loc[rows['sseqid'] == uniqAcc]

            ## making a list of all the alignment start and end positions 
            starts = list(rowsAcc['qstart'].astype('int')) 
            ends = list(rowsAcc['qend'].astype('int'))

            startsEndsDf = pd.DataFrame(list(zip(starts,ends)),
                                        columns =['starts','ends'])

            startsEndsDfSort = startsEndsDf.sort_values('starts')

            startsSort = list(startsEndsDfSort['starts'])
            endsSort = list(startsEndsDfSort['ends'])

            coverage = 0
            currentpos = 0
            for i in range(len(startsSort)):
                alignLength = endsSort[i] - startsSort[i]
                if startsSort[i] > currentpos:
                    coverage += alignLength
                    currentpos = endsSort[i]
                elif (startsSort[i] < currentpos) and (endsSort[i] > currentpos):
                    coverage += (endsSort[i] - currentpos)
                    currentpos = endsSort[i]
        
            percentCov = (coverage/int(totalScaffLength))*100
            
            if percentCov > 95: 
                highCovReport = [uniqScaff,uniqAcc,totalScaffLength,coverage,percentCov]
                highCovReportLines.append(highCovReport)
                highCovScaffs.add(uniqScaff + "\n")

    return (highCovReportLines, highCovScaffs)
    
## Calculating coverage ----

blastdf = readfile(args.blastout, dfname = "blastdf")
highCovReportLines = calccov(blastdf)

## Writing the output to a csv report. 
highCovReports = (pd.DataFrame(highCovReportLines[0], columns=['Scaffold','Acc_number','Scaffold_len','Scaffold_align_cov','Perc_align_cov'])).sort_values('Perc_align_cov', ascending = False)
## just pass a file name for now - can rig something better later. 
highCovReports.to_csv("cov_report.tsv", sep="\t")


with open('mito_scaff_names.txt', 'w') as f:
    [f.write(scaff) for scaff in highCovReportLines[1]]
    f.close()