#!/usr/bin/env python3 #This tool was developed for the purposes of the Vertebrate #Genomes Project. Redistribution and use in source and binary forms, #with or without modification, are permitted provided that the #following conditions are met: # #1. Redistributions of source code must retain the above copyright #notice, this list of conditions and the following disclaimer. # #2. Redistributions in binary form must reproduce the above copyright #notice, this list of conditions and the following disclaimer in the #documentation and/or other materials provided with the distribution. # #3. Neither the name of the copyright holder nor the names of its #contributors may be used to endorse or promote products derived from #this software without specific prior written permission. # #THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT #LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR #A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT #HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, #SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT #LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, #DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY #THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import csv from pickle import TRUE import sys import pandas as pd import argparse csv.field_size_limit(sys.maxsize) ## Arguments/inputs ---- parser = argparse.ArgumentParser(add_help=TRUE) parser.add_argument("--blastout", help="output from submit_mito_blast2.sh; format 6 csv table generated by blasting the assembly against the mitochondrial database") args = parser.parse_args() ## Functions ---- def readfile(infile, dfname): # Open tabular format output from blast of scaffolds against NCBI mitochondrial db tabfile = [] with open(infile,'r') as file: file = csv.reader(file, delimiter = '\t') for line in file: tabfile.append(line) ## put in some sort of ... if sum alignment lengths < total scaff length, do not bother checking for coverage dfname = pd.DataFrame(tabfile, columns = ['qseqid', 'sseqid', 'length', 'qstart', 'qend', 'evalue' , 'qlen', 'qcovs','qcovhsp']) return (dfname) def calccov(df): highCovReportLines = [] highCovScaffs = set() ## list of all the unique scaffolds named in the blast output uniqScaffs = df.qseqid.unique() for uniqScaff in uniqScaffs: rows = df.loc[df['qseqid'] == uniqScaff] ## isolate rows with a particular scaffold name uniqAccs = rows.sseqid.unique() totalScaffLength = (list(rows['qlen']))[0] for uniqAcc in uniqAccs: rowsAcc = rows.loc[rows['sseqid'] == uniqAcc] ## making a list of all the alignment start and end positions starts = list(rowsAcc['qstart'].astype('int')) ends = list(rowsAcc['qend'].astype('int')) startsEndsDf = pd.DataFrame(list(zip(starts,ends)), columns =['starts','ends']) startsEndsDfSort = startsEndsDf.sort_values('starts') startsSort = list(startsEndsDfSort['starts']) endsSort = list(startsEndsDfSort['ends']) coverage = 0 currentpos = 0 for i in range(len(startsSort)): alignLength = endsSort[i] - startsSort[i] if startsSort[i] > currentpos: coverage += alignLength currentpos = endsSort[i] elif (startsSort[i] < currentpos) and (endsSort[i] > currentpos): coverage += (endsSort[i] - currentpos) currentpos = endsSort[i] percentCov = (coverage/int(totalScaffLength))*100 if percentCov > 95: highCovReport = [uniqScaff,uniqAcc,totalScaffLength,coverage,percentCov] highCovReportLines.append(highCovReport) highCovScaffs.add(uniqScaff + "\n") return (highCovReportLines, highCovScaffs) ## Calculating coverage ---- blastdf = readfile(args.blastout, dfname = "blastdf") highCovReportLines = calccov(blastdf) ## Writing the output to a csv report. highCovReports = (pd.DataFrame(highCovReportLines[0], columns=['Scaffold','Acc_number','Scaffold_len','Scaffold_align_cov','Perc_align_cov'])).sort_values('Perc_align_cov', ascending = False) ## just pass a file name for now - can rig something better later. highCovReports.to_csv("cov_report.tsv", sep="\t") with open('mito_scaff_names.txt', 'w') as f: [f.write(scaff) for scaff in highCovReportLines[1]] f.close()