#!/usr/bin/env perl
###############################################################################
#
# annotateM
#
# The idea here is an automated way of annotating your genome based on
# multiple available databases and to produce a tab-delimited file of
# all the annotations, evalues, scores, descriptions.
#
# Suggested workflow:
# 1) run your genome nucleotide fasta file through annotateM
# 2) then run post_annotateM to include the contig id,orf_start and end
# 3) generate a tab-delimited file
# 4) open the file in ms excel or oo calc
# 5) manually curate the annotations based on evalues/scores/desc etc
# 6) metabolic reconstruction of organism
#
# Copyright (C) Mohamed Fauzi Haroon
# Special appearance from Adam Skarshewski
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#
###############################################################################
#pragmas
use strict;
use warnings;
#core Perl modules
use Getopt::Long;
use Carp;
use Data::Dumper;
#CPAN modules
#locally-written modules
BEGIN {
select(STDERR);
$| = 1;
select(STDOUT);
$| = 1;
}
# edit here to log all external commands
my $global_log_commands = 0;
# ext command failure levels
use constant {
IGNORE_FAILURE => 0,
WARN_ON_FAILURE => 1,
DIE_ON_FAILURE => 2
};
# get input params and print copyright
printAtStart();
my $global_options = checkParams();
# Database paths
my $img_protein_database = '/srv/db/img/4.1/dereplicated/img_dereplicated_species.genes.faa'
######################################################################
# CODE HERE
######################################################################
# check that the file exists
checkFileExists($global_options->{'in'});
# run prokka to generate the ORFs and also prokka annotations
if (! -e "./prokka_annotation/")
{
print "Running prokka v1.8\n";
checkAndRunCommand("prokka", [{
"--locustag" => $global_options->{'locustag'},
"--outdir" => "prokka_annotation",
"--prefix" => $global_options->{'locustag'},
"--kingdom" => $global_options->{'kingdom'},
"--cpus" => $global_options->{'threads'},
$global_options->{'in'},
}], DIE_ON_FAILURE);
}
# identify the ORF called amino acid fasta file
my $locus = $global_options->{'locustag'};
# blast against img
if (! -e "./$locus.faaVSimg.blastp")
{
print "BLASTing against IMG 4.1 database...............\n";
checkAndRunCommand("cat",
[[
"prokka_annotation/$locus.faa |",
"parallel",
"-j" => $global_options->{'threads'},
"--block"=> "100k",
"--recstart",
"'>'",
"--pipe",
"blastp",
-db => $img_protein_database,
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
-query => "-",
"> $locus.faaVSimg.blastp",
]], DIE_ON_FAILURE);
}
# reciprocal blast of img positive hits against genome ORF
if (! -e "./subsetimg.faaVS$locus.faa.blastp")
{
print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n";
checkAndRunCommand("contig_extractor.pl",
[[
-i => "$locus.faaVSimg.blastp",
-d => $img_protein_database,
-b => '',
-S => '',
-o => "subsetimg.faa",
]], DIE_ON_FAILURE);
checkAndRunCommand("makeblastdb",
[[
-in => "prokka_annotation/$locus.faa",
-dbtype => "prot",
]], DIE_ON_FAILURE);
checkAndRunCommand("blastp",
[[
-query => "subsetimg.faa",
-db => "prokka_annotation/$locus.faa",
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
-num_threads => $global_options->{'threads'},
-out => "subsetimg.faaVS$locus.faa.blastp",
]], DIE_ON_FAILURE);
}
# blast against uniref
if (! -e "./$locus.faaVSuniref90.blastp")
{
print "BLASTing against latest Uniref90 April2014 database ................\n";
checkAndRunCommand("cat",[[
"prokka_annotation/$locus.faa |",
"parallel",
"-j" => $global_options->{'threads'},
"--block"=> "100k",
"--recstart",
"'>'",
"--pipe",
"blastp",
-db => "/srv/db/uniprot/uniref-20140403/uniref90.fasta",
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
-query => "-",
"> $locus.faaVSuniref90.blastp",
#-num_threads => $global_options->{'threads'},
]], DIE_ON_FAILURE);
}
# reciprocal blast of Uniref positive hits against genome ORF
if (! -e "./subsetuniref.faaVS$locus.faa.blastp")
{
print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n";
checkAndRunCommand("contig_extractor.pl",
[[
-i => "$locus.faaVSuniref90.blastp",
-d => "/srv/db/uniprot/uniref-20140403/uniref90.fasta",
-b => '',
-S => '',
-o => "subsetuniref.faa",
]], DIE_ON_FAILURE);
checkAndRunCommand("blastp",
[[
-query => "subsetuniref.faa",
-db => "prokka_annotation/$locus.faa",
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
-num_threads => $global_options->{'threads'},
-out => "subsetuniref.faaVS$locus.faa.blastp",
]], DIE_ON_FAILURE);
}
# blast against COG
if (! -e "./$locus.faaVSCOG.blastp")
{
print "BLASTing against the one and only COG database................\n";
checkAndRunCommand("cat",[[
"prokka_annotation/$locus.faa |",
"parallel",
"-j" => $global_options->{'threads'},
"--block"=> "100k",
"--recstart",
"'>'",
"--pipe",
"blastp",
-db => "/srv/db/cog/cog_blast_prot_db",
-outfmt => 6,
-max_target_seqs => 1,
-evalue => $global_options->{'evalue'},
-query => "-",
"> $locus.faaVSCOG.blastp",
#-num_threads => $global_options->{'threads'},
]], DIE_ON_FAILURE);
}
# HMMSCAN against PFAM
if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned")
{
print "HMMscanning against latest Pfam 27 database................\n";
checkAndRunCommand("pfam_scan.pl",[[
-cpu => $global_options->{'threads'},
-e_seq => $global_options->{'evalue'},
-outfile => "$locus.faaVSPfam-A.hmm.hmmscanned",
-fasta => "prokka_annotation/$locus.faa",
-dir => "/srv/db/pfam/27",
]], DIE_ON_FAILURE);
}
# HMMSCAN against TIGRfam
if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned")
{
print "HMMscanning against TIGRfam April2014 database................\n";
checkAndRunCommand("hmmscan",[[
"--tblout",
"$locus.faaVStigr_all.hmm.hmmscanned",
"--noali",
-E => $global_options->{'evalue'},
"--cpu",
$global_options->{'threads'},
"/srv/db/tigrfam/14.0/TIGRFAMs_14.0_HMM/tigr_all.hmm",
"prokka_annotation/$locus.faa",
]], DIE_ON_FAILURE);
}
# convert the hmmscan output to tab delimited
checkAndRunCommand("awk",[[
"'{\$1=\$1}{ print }'",
"$locus.faaVSPfam-A.hmm.hmmscanned",
"| sed 's/\\s/\\t/g'",
"> $locus.faaVSPfam-A.hmm.hmmscanned.tab",
]], DIE_ON_FAILURE);
checkAndRunCommand("awk",[[
"'{\$1=\$1}{ print }'",
"$locus.faaVStigr_all.hmm.hmmscanned",
"| sed 's/^\\s+//'",
"| sed 's/\\s+\$//'",
"| sed 's/\\s/\\t/g'",
"> $locus.faaVStigr_all.hmm.hmmscanned.tab",
]], DIE_ON_FAILURE);
# declare hashes for img
my %access2imgid=();
my %img2reciprocal = ();
my %imghash2 =();
#my @orfid = ();
# read the img blast output and store in hash
# SAMPLE img blast output -
# phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663
# phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119
# phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205
# phycis_04083 640753047|640896165 61.55 1186 444 3 1 1177 1 1183 0.0 1504
# columns[0] = orfid
# columns[1] = imgid
# columns[10] = evalue
# columns[11] = blast score
open my $IMGblast, "./$locus.faaVSimg.blastp", or die "Couldn't open file $locus.faaVSimg.blastp\n";
while (<$IMGblast>)
{
chomp $_;
my @columns = split (/\t/, $_);
# push @orfid, $columns[0];
if ($columns[11] > 60)
{
# push @orfid, $columns[0];
#store access2imgid hash with the imgid key and point towards the orfid and value is the output i want printed out later
$access2imgid{$columns[1]}->{$columns[0]} = "$columns[1]\t$columns[0]\t$columns[10]\t$columns[11]";
}
}
#print Dumper (\%access2imgid);
# read img id2names.txt which is the file to get the gene identity of the imgid
# SAMPLE img id2names.txt file -
# 650716001|650846201 Ahos_0001 replication initiator protein Cdc6-3 Acidianus hospitalis W1
# 650716001|650846202 Ahos_0002 hypothetical protein Acidianus hospitalis W1
# 650716001|650846203 Ahos_0003 transcriptional coactivator/pterin dehydratase Acidianus hospitalis W1
# 650716001|650846204 Ahos_0004 GGCT (gamma glutamyl cyclotransferase) domain-containing protein Acidianus hospitalis W1
# columns[0] = imgid
# columns[1] = gene name
# columns[2] = organism
open my $imgid2names, "/srv/db/img/4.1/blastdbs/img4.1_id2names.txt", or die "Couldn't open img4.1_id2names.txt\n";
open my $img_temp_OUT, ">img_output_temp.txt";
while (<$imgid2names>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $access2imgid{$columns[0]})
{
foreach my $orfid (keys $access2imgid{$columns[0]})
{
#print "$orfid\n";
$img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]";
print {$img_temp_OUT} "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]\n";
#$img2reciprocal{$columns[0]} = "$orfid\t$columns[1]\t$columns[2]";
#print {$img_temp_OUT} "$orfid\t$columns[1]\t$columns[2]\n";
#print Dumper (\%access2imgid);
}
}
}
#print Dumper (\%access2imgid);
close($IMGblast);
close($imgid2names);
close($img_temp_OUT);
# read my reciprocal img blast output and store in hash
# SAMPLE
# 2513020047|2513221347 phycis_01043 39.30 285 168 4 21 301 21 304 2e-51 172
# 648028035|648160186 phycis_03502 40.55 217 122 4 7 221 422 633 1e-48 167
# 639633053|639783588 phycis_00179 49.23 260 121 4 14 269 20 272 3e-80 246
# 639633064|639773205 phycis_02647 29.24 383 234 11 8 370 3 368 2e-45 160
open my $rIMGblast, "./subsetimg.faaVS$locus.faa.blastp", or die "Couldn't open file subsetimg.faaVS$locus.faa.blastp\n";
open my $img_temp_OUT2, ">img_output_temp2.txt";
while (<$rIMGblast>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $img2reciprocal{$columns[0]})
{
print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n";
}
else
{
print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n";
}
}
close($img_temp_OUT2);
# hashes for uniref
my %hash4 = ();
my %hash5 =();
my %hash6 = ();
# read uniref blast and store in hash
open my $unirefblast, "./$locus.faaVSuniref90.blastp", or die "Couldn't open file $locus.faaVSuniref90.blastp\n";
while (<$unirefblast>)
{
chomp $_;
my @columns = split (/\t/, $_);
if ($columns[11] > 60)
{
$hash4{$columns[0]} = $columns[1];
$hash4{$columns[1]} = $columns[0];
$hash5{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
}
}
# read uniref id2names.txt
open my $unirefid2names, "/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt", or die "Couldn't open id2names.txt\n";
open my $uniref_temp_OUT, ">uniref_output_temp.txt";
while (<$unirefid2names>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $hash5{$columns[0]})
{
$hash6{$columns[0]} = "$hash5{$columns[0]}\t$columns[1]\t$columns[2]";
print {$uniref_temp_OUT} "$hash5{$columns[0]}\t$columns[1]\t$columns[2]\n";
}
}
close($unirefblast);
close($unirefid2names);
close($uniref_temp_OUT);
# read my reciprocal img blast output and store in hash
open my $runirefblast, "./subsetuniref.faaVS$locus.faa.blastp", or die "Couldn't open file subsetuniref.faaVS$locus.faa.blastp\n";
open my $uniref_temp_OUT2, ">uniref_output_temp2.txt";
while (<$runirefblast>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $hash6{$columns[0]})
{
print {$uniref_temp_OUT2} "$hash6{$columns[0]}\treciprocal\n";
}
}
close($uniref_temp_OUT2);
# hashes for pfam
my %hash7 = ();
my %hash8 = ();
# read pfam hmmscan output and store in hash
open my $pfamoutput, "./$locus.faaVSPfam-A.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVSPfam-A.hmm.hmmscanned.tab\n";
while (<$pfamoutput>)
{
next if /^\s*(#.*)?$/;
next if $pfamoutput =~ /^#/;
next if $pfamoutput =~ /^=/;
chomp $_;
my @columns = split (/\t/, $_);
if ($columns[11] > 60)
{
my @pfam_columns = split (/\./, $columns[5]);
my $pfam_id = $pfam_columns[0];
$hash7{$columns[0]} = $pfam_columns[0];
$hash7{$pfam_columns[0]} = $columns[0];
$hash8{$pfam_columns[0]} = "$columns[0]\t$columns[12]\t$columns[11]";
}
}
# read Pfam-A.clans.tsv
open my $pfamid2names, "/srv/db/pfam/27/Pfam-A.clans.tsv", or die "Couldn't open Pfam-A.clans.tsv\n";
open my $pfam_temp_OUT, ">pfam_output_temp.txt";
while (<$pfamid2names>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $hash8{$columns[0]})
{
print {$pfam_temp_OUT} "$hash8{$columns[0]}\t$columns[4]\n";
}
}
close($pfamoutput);
close($pfamid2names);
close($pfam_temp_OUT);
# hashes for tigrfam
my %hash9 = ();
my %hash10 = ();
# read tigrfam hmmscan output and store in hash
open my $tigrfamoutput, "./$locus.faaVStigr_all.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVStigr_all.hmm.hmmscanned.tab\n";
while (<$tigrfamoutput>)
{
next if /^\s*(#.*)?$/;
next if $tigrfamoutput =~ /^#/;
chomp $_;
my @columns = split (/\t/, $_);
if ($columns[5] > 10)
{
$hash9{$columns[2]} = $columns[0];
$hash9{$columns[0]} = $columns[2];
$hash10{$columns[0]} = "$columns[2]\t$columns[4]\t$columns[5]";
}
}
# read tigrfam id2names2description
open my $tigrfamid2names, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed_updated2", or die "Couldn't open tigr_info_combined.parsed_updated2\n";
open my $tigrfam_temp_OUT, ">tigrfam_output_temp.txt";
while (<$tigrfamid2names>)
{
chomp $_;
my @columns = split (/\t/, $_);
$columns[0] =~ s/^\s+|\s+$//g;
#$columns[0] =~ s/^\s+//;
#$columns[0] =~ s/\s+$//;
if (exists $hash10{$columns[0]})
{
print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n";
}
}
close($tigrfamoutput);
close($tigrfamid2names);
close($tigrfam_temp_OUT);
# hashes for cog
my %hash11 = ();
my %hash12 = ();
my %hash13 = ();
# read cog blastp output and store in hash
open my $cogblast, "./$locus.faaVSCOG.blastp", or die "Couldn't open file $locus.faaVSCOG.blastp\n";
while (<$cogblast>)
{
chomp $_;
my @columns = split (/\t/, $_);
if ($columns[11] > 60)
{
$hash11{$columns[0]} = $columns[1];
$hash11{$columns[1]} = $columns[0];
$hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]";
}
}
# read cog prot2COG.tab
open my $cogid2names, "/srv/db/cog/prot2COG.tab", or die "Couldn't open prot2COG.tab\n";
open my $cog_temp_OUT, "> cog_output_temp.txt";
while (<$cogid2names>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $hash12{$columns[0]})
{
$hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]";
$hash13{$columns[1]} = $hash12{$columns[0]};
print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n";
}
}
close($cogblast);
close($cogid2names);
close($cog_temp_OUT);
# read cog listcogs.txt
open my $cogid2longernames, "/srv/db/cog/listcogs.txt", or die "Couldn't open listcogs.txt\n";
open my $cog_temp_OUT2, "> cog_output_temp2.txt";
while(<$cogid2longernames>)
{
chomp $_;
my @columns = split (/\t/, $_);
if (exists $hash13{$columns[5]})
{
print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n";
}
}
close($cog_temp_OUT2);
### now to parse all the temporary files and combine into one tab-delimited-file
# to store the IDs => DB => values/annotations
my %combined_bighash =();
# open file for output
open my $FINAL_OUTPUT, "> ./final_output.txt";
# print header
print {$FINAL_OUTPUT} "ORF_ID\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n";
# img
open my $img_annotation, "./img_output_temp2.txt", or die "Couldn't open img_output_temp2.txt\n";
while (<$img_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
# my @baba = @columns[1..$#columns];
my @baba = @columns[2..$#columns];
#print "@baba \n";
#$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba);
push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @baba);
}
# uniref
open my $uniref_annotation, "./uniref_output_temp2.txt", or die "Couldn't open uniref_output_temp2.txt\n";
while (<$uniref_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba);
push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba);
}
# prokka
# need to parse faa file to give prokka id2names
checkAndRunCommand("grep",[[
"'>'",
"prokka_annotation/$locus.faa |",
"sed",
"'s/>//g' |",
"sed",
-e => "'s/ /\\t/'",
"> prokka_temp_output.txt",
]], DIE_ON_FAILURE);
# SAMPLE gff file
##gff-version 3
##sequence-region contig_3875 1 10320
#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001
#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002
#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003
#open my $prokka_gff, "./prokka_annotation/$locus.gff", or die "Couldn't open $locus.gff\n";
#while (<$prokka_gff>)
#{
# next if $prokka_gff =~ /^#/;
# chomp $_;
# my @main_columns = split (/\t/, $_);
# $prokka_gff = my $ID =~ m/[ID\=](.*)[\;]/;
# $prokka_gff = my $product =~ m/[product\=](.*)[\;]/;
# print "$ID\t$product\n";
#}
open my $prokka_annotation, "./prokka_temp_output.txt", or die "Couldn't open prokka_temp_output.txt\n";
while (<$prokka_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'prokka'} = join("\t", @baba);
push @{$combined_bighash{$columns[0]}->{'c-prokka'}}, join("\t", @baba);
}
# cog
open my $cog_annotation, "./cog_output_temp2.txt", or die "Couldn't open cog_output_temp2.txt\n";
while (<$cog_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'cog'} = join("\t", @baba);
push @{$combined_bighash{$columns[0]}->{'d-cog'}}, join("\t", @baba);
}
# pfam
open my $pfam_annotation, "./pfam_output_temp.txt", or die "Couldn't open pfam_output_temp.txt\n";
while (<$pfam_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba);
push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba);
}
#print Dumper \%combined_bighash;
# tigrfam
open my $tigrfam_annotation, "./tigrfam_output_temp.txt", or die "Couldn't open tigrfam_output_temp.txt\n";
while (<$tigrfam_annotation>)
{
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
#$combined_bighash{$columns[0]}->{'tigrfam'} = join("\t", @baba);
push @{$combined_bighash{$columns[0]}->{'f-tigrfam'}}, join("\t", @baba);
}
# to print finally.................
# assign key and value for number of expected columns for each annotation type, important for putting NA in missing annotation types
my %column_lengths = (
'a-img' => 5,
'b-uniref' => 5,
'c-prokka' => 1,
'd-cog' => 5,
'e-pfam' => 3,
'f-tigrfam' => 4,
);
# print the orfids first
foreach my $ID (sort(keys %combined_bighash))
{
print {$FINAL_OUTPUT} "$ID\t";
foreach my $annotation_type (sort(keys %column_lengths))
{
# if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values
if (! exists $combined_bighash{$ID}->{$annotation_type})
{
# cool way of printing a certain string multiple times based on the values in the hash
print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t";
}
# check the derefencing made with @{$combined_bighash{$columns[0]}->{'f-tigrfam'}} and so on..
# the derefencing allows the hash be converted into an array so that we can read the hash for the different types of annotation types
elsif (ref($combined_bighash{$ID}->{$annotation_type}) eq 'ARRAY')
{
# place to store the columns in the values of the hash annotation types
my @storage_array;
foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}})
{
# each annotation types have different number of columns, so we need to split the columns first before
# we can add in the extra values if lets say an orfid hits multiple pfam/cog/tigrfam values
my @values = split("\t",$line);
# cool and alternate way of doing columns[1] = values[1], and so on.., repetitively
# what it basically means as long as the value i less than the number of columns in each annotation type
# add +1 to the string $i and do the push below
for (my $i = 0; $i <= $#values; $i++)
{
push @{$storage_array[$i]}, $values[$i];
}
}
#print Dumper(\@storage_array);
# array to store the multiple hits in each column. eg. test0001 orfid hits multiple pfam values pf0008 & pf0010
# so we would like to have the values combined together in the same column delimited by a comma
my @print_info_array;
for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++)
{
push @print_info_array, join("; ", @{$storage_array[$i]});
}
#print Dumper(\@print_info_array);
print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t";
}
else
{
print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t";
}
}
print {$FINAL_OUTPUT} "\n";
}
#close all files
close($img_annotation);
close($uniref_annotation);
close($prokka_annotation);
close($cog_annotation);
close($pfam_annotation);
close($tigrfam_annotation);
close($FINAL_OUTPUT);
######################################################################
# CUSTOM SUBS
######################################################################
# who needs custom subs...
######################################################################
# TEMPLATE SUBS
######################################################################
# PARAMETERS
sub checkParams {
#-----
# Do any and all options checking here...
#
my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "kingdom|k:s", "threads|t:s", "evalue|e:s");
my %options;
# Add any other command line options, and the code to handle them
#
GetOptions( \%options, @standard_options );
# if no arguments supplied print the usage and exit
#
exec("pod2usage $0") if (0 == (keys (%options) ));
# If the -help option is set, print the usage and exit
#
exec("pod2usage $0") if $options{'help'};
# Compulsory items
#if(!exists $options{''} ) { printParamError (""); }
if(!exists $options{'in'} ) { printParamError ("You MUST supply a fasta file"); }
return \%options;
}
sub printParamError
{
#-----
# What to do if there's something wrong with a parameter
#
my ($error) = @_;
print "**ERROR: $0 : $error\n"; exec("pod2usage $0");
}
sub overrideDefault
{
#-----
# Set and override default values for parameters
#
my ($default_value, $option_name) = @_;
if(exists $global_options->{$option_name})
{
return $global_options->{$option_name};
}
return $default_value;
}
#####################################################################
# FILE IO
sub openWrite
{
#-----
# Open a file for writing
#
my ($fn) = @_;
open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n";
return $fh;
}
sub openRead
{
#-----
# Open a file for reading
#
my ($fn) = @_;
open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n";
return $fh;
}
######################################################################
# EXTERNAL COMMANDS
#
# checkAndRunCommand("ls", {
# -a => ""
# },
# WARN_ON_FAILURE);
sub checkFileExists {
#-----
# Does a file exists?
#
my ($file) = @_;
unless(-e $file) {
croak "**ERROR: $0 : Cannot find:\n$file\n";
}
}
sub logExternalCommand
{
#-----
# Log a command line command to the command line!
#
if(1 == $global_log_commands) {
print $_[0], "\n";
}
}
sub isCommandInPath
{
#-----
# Is this command in the path?
#
my ($cmd, $failure_type) = @_;
if (system("which $cmd |> /dev/null")) {
handleCommandFailure($cmd, $failure_type);
}
}
sub runExternalCommand
{
#-----
# Run a command line command on the command line!
#
my ($cmd) = @_;
logExternalCommand($cmd);
system($cmd);
}
sub checkAndRunCommand
{
#-----
# Run external commands more sanelier
#
my ($cmd, $params, $failure_type) = @_;
isCommandInPath($cmd, $failure_type);
# join the parameters to the command
my $param_str = join " ", map {formatParams($_)} @{$params};
my $cmd_str = $cmd . " " . $param_str;
print "The command currently running:\t$cmd_str\n";
logExternalCommand($cmd_str);
# make sure that all went well
if (system($cmd_str)) {
handleCommandFailure($cmd_str, $failure_type)
}
}
sub formatParams {
#---------
# Handles and formats the different ways of passing parameters to
# checkAndRunCommand
#
my $ref = shift;
if (ref($ref) eq "ARRAY") {
return join(" ", @{$ref});
} elsif (ref($ref) eq "HASH") {
return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref});
}
croak 'The elements of the $params argument in checkAndRunCommand can ' .
'only contain references to arrays or hashes\n';
}
sub handleCommandFailure {
#-----
# What to do when all goes bad!
#
my ($cmd, $failure_type) = @_;
if (defined($failure_type)) {
if ($failure_type == DIE_ON_FAILURE) {
croak "**ERROR: $0 : " . $! . "\n";
} elsif ($failure_type == WARN_ON_FAILURE) {
carp "**WARNING: $0 : " . $! . "\n";
}
}
}
######################################################################
# MISC
sub printAtStart {
print<<"EOF";
----------------------------------------------------------------
$0
annotateM - annotate my genome
Due to the blast processes against multiple databases, this whole
annotation pipeline will usually take awhile. Please be patient!
What you get in the end will save you heaps of time.
----------------------------------------------------------------
EOF
}
__DATA__
=head1 NAME
annotateM
=head1 COPYRIGHT
Copyright (C) Mohamed Fauzi Haroon
Special appearance from Adam Skarshewski
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
=head1 DESCRIPTION
Want to annotate your genome? annotateM!
=head1 SYNOPSIS
annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue]
-i FASTA_FILE Nucleotide fasta file
-l locustag Name of locus tag
-k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated
-t threads Number of threads
-e evalue Evalue for BLAST, recommended 1e-3
[-help -h] Displays basic usage information
=cut