#!/usr/bin/env perl use strict; use FindBin; use File::Basename; use Cwd qw(abs_path); use File::Spec; use Path::Tiny; use Data::Dumper; use List::MoreUtils qw(zip); use Cwd qw(abs_path); #.............................................................................. # Globals needed before --help etc my $VERSION = "1.4.0"; my $EXE = basename($0); my $AUTHOR = 'Torsten Seemann'; my $TWITTER = '@torstenseemann'; my $URL = 'https://github.com/tseemann/abricate'; my @OUTFMT = qw(tsv csv bed gff json); my %OUTFMT = (map { $_=>1 } @OUTFMT); #.............................................................................. # Command line options my(@Options, $debug, $quiet, $setupdb, $list, $summary, $identity, $check, $datadir, $db, $minid, $mincov, $threads, $noheader, $csv, $nopath, $fofn, $outfmt); setOptions(); #.............................................................................. # Globals my $OUTSEP = "\t"; my $ABSENT = '.'; my $FIELD = '%COVERAGE'; my $FIELDSEP = ';'; my $IDSEP = '~~~'; my $CULL = 1; # BLAST -culling_limit my @BLAST_FIELDS = qw( qseqid qstart qend qlen sseqid sstart send slen sstrand evalue length pident gaps gapopen stitle ); my @COLUMNS = qw( FILE SEQUENCE START END STRAND GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION PRODUCT RESISTANCE ); $COLUMNS[0] = '#'.$COLUMNS[0]; my @REQUIRE = qw(blastn blastx makeblastdb blastdbcmd any2fasta gzip unzip); #.............................................................................. # Option parsing $threads >= 1 or err("Invalid --threads option, must be 1 or higher"); ($minid > 0 and $minid <= 100) or err("--minid must be between 1 and 100"); ($mincov >= 0 and $mincov <= 100) or err("--mincov must be between 0 and 100"); $OUTSEP = ',' if $csv; # default is tab $OUTFMT{$outfmt} or err("Invalid --outfmt '$outfmt' (need: @OUTFMT"); if ($fofn) { -r $fofn or err("Could not open --fofn '$fofn'"); my @filenames = path($fofn)->lines( { chomp => 1 } ); @filenames or err("No files found in --fofn '$fofn'"); msg("Using files from --fofn and ignoring any provided on the command line."); @ARGV = @filenames; } if ($identity) { $FIELD = '%IDENTITY'; msg("Using %IDENTITY for the summary table instead of %COVERAGE"); } if ($summary) { @ARGV >= 1 or err("--summary needs >=1 abricate output files"); summary_table( @ARGV ); exit; } if ($check) { $debug=1; msg("Checking dependencies are installed:"); require_exe( @REQUIRE ); msg("OK."); exit; } # check if dependencies are installed require_exe( @REQUIRE ); # do these now that we know the deps are ok if ($list or $setupdb) { list_databases($datadir, $setupdb); exit; } # check if blast > 2.2.30 to support 'gaps' custom field for my $blast ('blastn', 'blastx') { my($version) = qx"$blast -version 2>&1"; $version =~ m/2.(\d+.\d+)\+/ or err("Could not parse $blast version from '$version'"); $1 >= 2.30 or err("You need to install BLAST+ 2.2.30 or higher"); } # check database exists $datadir = abs_path($datadir); my $db_path = "$datadir/$db/sequences"; my $dbinfo = blast_database_info($db_path); $dbinfo or err("BLAST database not found: $db_path"); msg("Using", $dbinfo->{DBTYPE}, "database $db: ", $dbinfo->{SEQUENCES}, "sequences - ", $dbinfo->{DATE}); # output format my $format = "6 @BLAST_FIELDS"; print line(@COLUMNS) unless $noheader; FILE: for my $file (@ARGV) { my %seen; my @hit; msg("Processing: $file"); -r $file or err("'$file' does not exist, or is unreadable"); my $blastcmd = $dbinfo->{DBTYPE} eq 'nucl' ? "blastn -task blastn -dust no -perc_identity $minid" : "blastx -task blastx-fast -seg no" ; my $cmd = "bash -c 'set -euo pipefail;" . " any2fasta -q -u \Q$file\E |" . " $blastcmd -db \Q$db_path\E -outfmt \"$format\" -num_threads $threads" . " -evalue 1E-20 -culling_limit $CULL" . " -max_target_seqs 10000" # https://github.com/tseemann/abricate/issues/135 # . " -max_target_seqs ".$dbinfo->{SEQUENCES} # Issue #76 . "'" ; msg("Running: $cmd") if $debug; open BLAST, "$cmd |"; while () { chomp; my @x = split m/\t/; @x == @BLAST_FIELDS or err("can not find sequence data in '$file'"); my %hit = (map { $BLAST_FIELDS[$_] => $x[$_] } 0 .. @BLAST_FIELDS-1); ($hit{sstart}, $hit{send}) = ($hit{send}, $hit{sstart}) if $hit{sstrand} eq 'minus'; next if $seen{ join('~', @hit{qw(qseqid qstart qend)}) }++; my $pccov = 100 * ($hit{length}-$hit{gaps}) / $hit{slen}; next unless $pccov >= $mincov; # $hit{sseqid} =~ s/_.*$//; my($database, $gene, $acc, $abx) = split m/$IDSEP/, $hit{sseqid}; # if it wasn't in the format we expected, try and be sensible if (!defined $gene and !defined $acc) { $acc = ''; $gene = $hit{sseqid}; $database = $db; # name specified with --db } msg( Dumper(\%hit) ) if $debug; my $product = $hit{'stitle'} || "n/a"; $product =~ s/[,\t]//g; # remove output separators # https://github.com/tseemann/abricate/issues/95 $product =~ s/^\S+\s+// if $product =~ m/$IDSEP/; my $minimap = minimap( @hit{qw(sstart send slen gapopen)} ); # $minimap .= '!-'.$hit{gaps} if $hit{gaps} > 0; push @hit, [ $nopath ? basename($file) : $file , $hit{qseqid}, $hit{qstart}, $hit{qend}, ($hit{sstrand} eq 'minus' ? '-' : '+'), $gene, #$hit{sseqid}, $hit{sstart}.'-'.$hit{send}.'/'.$hit{slen}, $minimap, $hit{gapopen}.'/'.$hit{gaps}, sprintf("%.2f", $pccov), sprintf("%.2f", $hit{pident}), $database, $acc, $product, $abx, ]; } close BLAST; # Check the exit status of BLAST handle and abort my $exitcode = $? >> 8; if ($exitcode != 0) { msg("Command exited with non-zero exit code $exitcode. Aborting Abricate."); exit($exitcode); } msg("Found", scalar(@hit), "genes in $file"); # Sort hits #0 1 2 3 4 5 #FILE CHR START END GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION # @hit = sort { $a->[4] cmp $b->[4] || $b->[8] <=> $a->[8] } @hit; @hit = sort { $a->[1] cmp $b->[1] || $a->[2] <=> $b->[2] } @hit; # results to stdout print line(@$_) for @hit; # Inspiration msg( "Tip:", motd() ); # 'message of the day' msg("Done."); } #---------------------------------------------------------------------- sub motd { my @motd = ( "it's important to realise $EXE uses DNA matching, not AA.", "$EXE can also find virulence factors; use --list to see all supported databases.", "remember that abricate is unable to find AMR-mediated SNPs.", "the --fofn option allows you to feed in a big list of files to run on.", "you can use the --summary option to combine reports in a presence/absence matrix.", "found a bug in $EXE? Post it at $URL/issues.", "have a suggestion for $EXE? Tell me at $URL/issues", "the $EXE manual is at $URL/blob/master/README.md", "did you know? $EXE was named after 'A'nti 'B'acterial 'R'esistiance", ); srand( (localtime)[0] ); # seed return $motd[ int(rand(scalar(@motd))) ] ; } #---------------------------------------------------------------------- sub minimap { my($x, $y, $L, $broken, $scale, $on, $off) = @_; my $WIDTH = 15 - ($broken ? 1 : 0); $broken ||= 0; $scale ||= ($L/$WIDTH); $on ||= '='; $off ||= '.'; $x = int( $x / $scale ); $y = int( $y / $scale ); $L = int( $L / $scale ); my $map=''; for my $i (0 .. $WIDTH-1) { # msg("$i $x $y $L $scale"); $map .= ($i >= $x and $i <= $y) ? $on : $off; $map .= '/' if $broken and $i==int($WIDTH/2); } return $map; } #---------------------------------------------------------------------- sub summary_table { my(@fname) = @_; # https://github.com/tseemann/abricate/issues/32 # if we have 1 file, we use the '#FILE' column as the key: "dutch mode" # if we have >1 files, we use the $fname[] name my $dutch_mode = (@fname == 1); my %gene; # genes observed my %data; my @hdr; for my $fname (@fname) { if (exists $data{$fname}) { wrn("Skipping duplicate file: $fname"); next; } # initialise to empty so it appears in report even if 0 genes found $data{$fname} = {} unless $dutch_mode;; msg("Parsing: $fname") if $debug; open my $fh, '<', $fname or err("Can't open '$fname' to summarize."); while (<$fh>) { chomp; my @col = split m/$OUTSEP/; msg("### [$.] @col") if $debug; @hdr = @col if ! @hdr; # first row we see will be header next if $col[0] =~ m/^#/; $col[0] = basename($col[0]) if $nopath; my $file = $dutch_mode ? $col[0] : $fname; my $row = { zip @hdr, @col }; #print STDERR Dumper($row)) if $debug; $gene{ $row->{'GENE'} }++; push @{ $data{ $file }{ $row->{'GENE'} } } , $row; } #wrn(Dumper(\%data)); } my @gene = sort { $a cmp $b } keys %gene; print line('#FILE', 'NUM_FOUND', @gene); for my $fname (sort { $a cmp $b } keys %data) { my @present = map { exists $data{$fname}{$_} ? join( $FIELDSEP, map { $_->{$FIELD} } @{$data{$fname}{$_}} ) : $ABSENT } @gene; my $label = $nopath ? basename($fname) : $fname;; print line( $label, scalar( keys %{$data{$fname}} ), @present ); } } #---------------------------------------------------------------------- sub blast_database_info { my($prefix) = @_; my(@res) = qx(blastdbcmd -info -db \Q$prefix\E 2> /dev/null); chomp @res; my $res = join(' ', @res); my $info = { PREFIX => $prefix }; $res =~ m/\b([\d,]+)\s+sequences;/ or return; $info->{SEQUENCES} = $1; $info->{SEQUENCES} =~ s/,//g; $res =~ m/\btotal\s+(\S+)/ or return; $info->{DBTYPE} = $1 eq 'residues' ? 'prot' : 'nucl'; $res =~ m/\bDatabase: (\S+)/ or return; $info->{TITLE} = $1; $res =~ m/\bDate: (\w+)\s+(\d+),\s+(\d{4})\s/ or return; $info->{DATE} = "$3-$1-$2"; # YYYY-MM-DD return $info; } #---------------------------------------------------------------------- # blastdbcmd -list ../db/bacmet2/ -list_outfmt "%f~~~%p~~~%t~~~%d~~~%l~~~%n~~~%U" # ../db/bacmet2/sequences~~~Nucleotide~~~bacmet2~~~Apr 6, 2018 1:02 PM~~~280935~~~744~~~629421 sub list_databases { my($from, $setup_too) = @_; my @dir = grep { $_->is_dir } path($from)->realpath->children; # all subfolders my $count=0; my @list; for my $d (@dir) { my $name = $d->basename; # gives the 'file' part of the pathname my $dbname = "$from/$name/sequences"; if (-r $dbname) { $setup_too and make_blast_database($dbname, $name); -r "$dbname.pin" || -r "$dbname.nin" or err("Database $name is not indexed, please try: abricate --setupdb"); my $info = blast_database_info( $dbname ); # save for end push @list, [ $name, $info->{SEQUENCES}, $info->{DBTYPE}, $info->{DATE} ]; } } if (@list) { # print to STDOUT print line("DATABASE", "SEQUENCES", "DBTYPE", "DATE"); print map { line(@$_) } @list; } else { msg("No databases found in $from"); } } #---------------------------------------------------------------------- sub make_blast_database { my($path, $name) = @_; -r $path or err("Can't read FASTA file: $path"); # guess if nucl or prot my @lines = grep { ! m/^>/ } path($path)->lines( { chomp=>1 } ); chomp @lines; my $letters = join '', @lines; my $dbtype = mol_type($letters); msg("Removing old BLAST indices for $path"); unlink <$path.[np]??>; msg("Making $dbtype BLAST index for $path"); system("makeblastdb -in '$path' -title '$name' -dbtype $dbtype -logfile /dev/null"); } #---------------------------------------------------------------------- sub mol_type { my($s) = @_; my $L = length($s); $s =~ s/[AGTC]//gi; my $M = length($s); #msg("mol_type: with_ATGC=$L without_AGTC=$M"); return $M > 0.5 * $L ? 'prot' : 'nucl'; } #---------------------------------------------------------------------- sub line { return join($OUTSEP, @_)."\n"; } #---------------------------------------------------------------------- sub version { print "$EXE $VERSION\n"; exit; } #---------------------------------------------------------------------- sub msg { print STDERR "@_\n" unless $quiet; } #---------------------------------------------------------------------- sub wrn { msg("WARNING:", @_); } #---------------------------------------------------------------------- sub err { print STDERR "ERROR: @_\n"; exit(1); } #---------------------------------------------------------------------- sub require_exe { my(@arg) = @_; for my $exe (@arg) { my $where = ''; for my $dir ( File::Spec->path ) { if (-x "$dir/$exe") { $where = "$dir/$exe"; last; } } if ($where) { msg("Found '$exe' => $where") if $debug; } else { err("Could not find '$exe'. Please install it and ensure it is in the PATH."); } } return; } #---------------------------------------------------------------------- # Option setting routines sub setOptions { use Getopt::Long; @Options = ( "GENERAL", {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, {OPT=>"debug!", VAR=>\$debug, DESC=>"Verbose debug output"}, {OPT=>"quiet!", VAR=>\$quiet, DESC=>"Quiet mode, no stderr output"}, {OPT=>"version!", VAR=>\&version, DESC=>"Print version and exit"}, {OPT=>"check!", VAR=>\$check, DESC=>"Check dependencies are installed"}, {OPT=>"threads=i", VAR=>\$threads, DEFAULT=>1, DESC=>"Use this many BLAST+ threads"}, {OPT=>"fofn=s", VAR=>\$fofn, DEFAULT=>'', DESC=>"Run on files listed in this file"}, "DATABASES", {OPT=>"setupdb!", VAR=>\$setupdb, DESC=>"Format all the BLAST databases"}, {OPT=>"list!", VAR=>\$list, DESC=>"List included databases"}, {OPT=>"datadir=s", VAR=>\$datadir, DEFAULT=>abs_path("$FindBin::RealBin/../db"), DESC=>"Databases folder"}, {OPT=>"db=s", VAR=>\$db, DEFAULT=>"ncbi", DESC=>"Database to use"}, "OUTPUT", # not implemented yet {OPT=>"outfmt=s", VAR=>\$outfmt, DEFAULT=>$OUTFMT[0], DESC=>"Output format: @OUTFMT"}, {OPT=>"noheader!", VAR=>\$noheader, DESC=>"Suppress column header row"}, {OPT=>"csv!", VAR=>\$csv, DESC=>"Output CSV instead of TSV"}, {OPT=>"nopath!", VAR=>\$nopath, DESC=>"Strip filename paths from FILE column"}, "FILTERING", {OPT=>"minid=f", VAR=>\$minid, DEFAULT=>80, DESC=>"Minimum DNA %identity"}, {OPT=>"mincov=f", VAR=>\$mincov, DEFAULT=>80, DESC=>"Minimum DNA %coverage"}, "MODE", {OPT=>"summary!", VAR=>\$summary, DESC=>"Summarize multiple reports into a table"}, {OPT=>"identity!", VAR=>\$identity, DESC=>"Use %IDENTITY instead of %COVERAGE in summary table"}, ); @ARGV or usage(1); &GetOptions(map {$_->{OPT}, $_->{VAR}} (grep { ref } @Options)) or exit(5); # Now setup default values. foreach (grep { ref } @Options) { if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) { ${$_->{VAR}} = $_->{DEFAULT}; } } } sub usage { my($exitcode) = @_; $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref $exitcode ||= 0; select STDERR if $exitcode; # write to STDERR if exitcode is error print "SYNOPSIS\n Find and collate amplicons in assembled contigs\n"; print "AUTHOR\n $AUTHOR ($TWITTER)\n"; print "USAGE\n"; print " % $EXE --list\n"; print " % $EXE [options] > out.tab\n"; print " % $EXE [options] --fofn fileOfFilenames.txt > out.tab\n"; print " % $EXE --summary ... > summary.tab\n"; foreach (@Options) { if (!ref $_) { print $_,"\n"; next } # print text lines my $opt = $_->{OPT}; $opt =~ s/!$//; $opt =~ s/=s$/ [X]/; $opt =~ s/=i$/ [N]/; $opt =~ s/=f$/ [n.n]/; printf " --%-13s %s%s.\n",$opt,$_->{DESC}, defined($_->{DEFAULT}) ? " [$_->{DEFAULT}]" : ""; } print "DOCUMENTATION\n $URL\n"; exit($exitcode); } #----------------------------------------------------------------------