#!/usr/bin/env perl ############################################################################### # # annotateM # # The idea here is an automated way of annotating your genome based on # multiple available databases and to produce a tab-delimited file of # all the annotations, evalues, scores, descriptions. # # Suggested workflow: # 1) run your genome nucleotide fasta file through annotateM # 2) then run post_annotateM to include the contig id,orf_start and end # 3) generate a tab-delimited file # 4) open the file in ms excel or oo calc # 5) manually curate the annotations based on evalues/scores/desc etc # 6) metabolic reconstruction of organism # # Copyright (C) Mohamed Fauzi Haroon # Special appearance from Adam Skarshewski # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ############################################################################### #pragmas use strict; use warnings; #core Perl modules use Getopt::Long; use Carp; use Data::Dumper; #CPAN modules #locally-written modules BEGIN { select(STDERR); $| = 1; select(STDOUT); $| = 1; } # edit here to log all external commands my $global_log_commands = 0; # ext command failure levels use constant { IGNORE_FAILURE => 0, WARN_ON_FAILURE => 1, DIE_ON_FAILURE => 2 }; # get input params and print copyright printAtStart(); my $global_options = checkParams(); # Database paths my $img_protein_database = '/srv/db/img/4.1/dereplicated/img_dereplicated_species.genes.faa' ###################################################################### # CODE HERE ###################################################################### # check that the file exists checkFileExists($global_options->{'in'}); # run prokka to generate the ORFs and also prokka annotations if (! -e "./prokka_annotation/") { print "Running prokka v1.8\n"; checkAndRunCommand("prokka", [{ "--locustag" => $global_options->{'locustag'}, "--outdir" => "prokka_annotation", "--prefix" => $global_options->{'locustag'}, "--kingdom" => $global_options->{'kingdom'}, "--cpus" => $global_options->{'threads'}, $global_options->{'in'}, }], DIE_ON_FAILURE); } # identify the ORF called amino acid fasta file my $locus = $global_options->{'locustag'}; # blast against img if (! -e "./$locus.faaVSimg.blastp") { print "BLASTing against IMG 4.1 database...............\n"; checkAndRunCommand("cat", [[ "prokka_annotation/$locus.faa |", "parallel", "-j" => $global_options->{'threads'}, "--block"=> "100k", "--recstart", "'>'", "--pipe", "blastp", -db => $img_protein_database, -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, -query => "-", "> $locus.faaVSimg.blastp", ]], DIE_ON_FAILURE); } # reciprocal blast of img positive hits against genome ORF if (! -e "./subsetimg.faaVS$locus.faa.blastp") { print "Reciprocal BLASTing positive IMG hits to $locus.faa ...............\n"; checkAndRunCommand("contig_extractor.pl", [[ -i => "$locus.faaVSimg.blastp", -d => $img_protein_database, -b => '', -S => '', -o => "subsetimg.faa", ]], DIE_ON_FAILURE); checkAndRunCommand("makeblastdb", [[ -in => "prokka_annotation/$locus.faa", -dbtype => "prot", ]], DIE_ON_FAILURE); checkAndRunCommand("blastp", [[ -query => "subsetimg.faa", -db => "prokka_annotation/$locus.faa", -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, -num_threads => $global_options->{'threads'}, -out => "subsetimg.faaVS$locus.faa.blastp", ]], DIE_ON_FAILURE); } # blast against uniref if (! -e "./$locus.faaVSuniref90.blastp") { print "BLASTing against latest Uniref90 April2014 database ................\n"; checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", "-j" => $global_options->{'threads'}, "--block"=> "100k", "--recstart", "'>'", "--pipe", "blastp", -db => "/srv/db/uniprot/uniref-20140403/uniref90.fasta", -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, -query => "-", "> $locus.faaVSuniref90.blastp", #-num_threads => $global_options->{'threads'}, ]], DIE_ON_FAILURE); } # reciprocal blast of Uniref positive hits against genome ORF if (! -e "./subsetuniref.faaVS$locus.faa.blastp") { print "Reciprocal BLASTing positive Uniref hits to $locus.faa ...............\n"; checkAndRunCommand("contig_extractor.pl", [[ -i => "$locus.faaVSuniref90.blastp", -d => "/srv/db/uniprot/uniref-20140403/uniref90.fasta", -b => '', -S => '', -o => "subsetuniref.faa", ]], DIE_ON_FAILURE); checkAndRunCommand("blastp", [[ -query => "subsetuniref.faa", -db => "prokka_annotation/$locus.faa", -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, -num_threads => $global_options->{'threads'}, -out => "subsetuniref.faaVS$locus.faa.blastp", ]], DIE_ON_FAILURE); } # blast against COG if (! -e "./$locus.faaVSCOG.blastp") { print "BLASTing against the one and only COG database................\n"; checkAndRunCommand("cat",[[ "prokka_annotation/$locus.faa |", "parallel", "-j" => $global_options->{'threads'}, "--block"=> "100k", "--recstart", "'>'", "--pipe", "blastp", -db => "/srv/db/cog/cog_blast_prot_db", -outfmt => 6, -max_target_seqs => 1, -evalue => $global_options->{'evalue'}, -query => "-", "> $locus.faaVSCOG.blastp", #-num_threads => $global_options->{'threads'}, ]], DIE_ON_FAILURE); } # HMMSCAN against PFAM if (! -e "./$locus.faaVSPfam-A.hmm.hmmscanned") { print "HMMscanning against latest Pfam 27 database................\n"; checkAndRunCommand("pfam_scan.pl",[[ -cpu => $global_options->{'threads'}, -e_seq => $global_options->{'evalue'}, -outfile => "$locus.faaVSPfam-A.hmm.hmmscanned", -fasta => "prokka_annotation/$locus.faa", -dir => "/srv/db/pfam/27", ]], DIE_ON_FAILURE); } # HMMSCAN against TIGRfam if (! -e "./$locus.faaVStigr_all.hmm.hmmscanned") { print "HMMscanning against TIGRfam April2014 database................\n"; checkAndRunCommand("hmmscan",[[ "--tblout", "$locus.faaVStigr_all.hmm.hmmscanned", "--noali", -E => $global_options->{'evalue'}, "--cpu", $global_options->{'threads'}, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_HMM/tigr_all.hmm", "prokka_annotation/$locus.faa", ]], DIE_ON_FAILURE); } # convert the hmmscan output to tab delimited checkAndRunCommand("awk",[[ "'{\$1=\$1}{ print }'", "$locus.faaVSPfam-A.hmm.hmmscanned", "| sed 's/\\s/\\t/g'", "> $locus.faaVSPfam-A.hmm.hmmscanned.tab", ]], DIE_ON_FAILURE); checkAndRunCommand("awk",[[ "'{\$1=\$1}{ print }'", "$locus.faaVStigr_all.hmm.hmmscanned", "| sed 's/^\\s+//'", "| sed 's/\\s+\$//'", "| sed 's/\\s/\\t/g'", "> $locus.faaVStigr_all.hmm.hmmscanned.tab", ]], DIE_ON_FAILURE); # declare hashes for img my %access2imgid=(); my %img2reciprocal = (); my %imghash2 =(); #my @orfid = (); # read the img blast output and store in hash # SAMPLE img blast output - # phycis_04080 649633083|649978419 38.08 1116 640 14 13 1099 1 1094 0.0 663 # phycis_04081 649633083|649980044 28.40 405 237 10 49 422 20 402 3e-27 119 # phycis_04082 649633030|649661236 42.86 259 144 3 1 256 1 258 1e-61 205 # phycis_04083 640753047|640896165 61.55 1186 444 3 1 1177 1 1183 0.0 1504 # columns[0] = orfid # columns[1] = imgid # columns[10] = evalue # columns[11] = blast score open my $IMGblast, "./$locus.faaVSimg.blastp", or die "Couldn't open file $locus.faaVSimg.blastp\n"; while (<$IMGblast>) { chomp $_; my @columns = split (/\t/, $_); # push @orfid, $columns[0]; if ($columns[11] > 60) { # push @orfid, $columns[0]; #store access2imgid hash with the imgid key and point towards the orfid and value is the output i want printed out later $access2imgid{$columns[1]}->{$columns[0]} = "$columns[1]\t$columns[0]\t$columns[10]\t$columns[11]"; } } #print Dumper (\%access2imgid); # read img id2names.txt which is the file to get the gene identity of the imgid # SAMPLE img id2names.txt file - # 650716001|650846201 Ahos_0001 replication initiator protein Cdc6-3 Acidianus hospitalis W1 # 650716001|650846202 Ahos_0002 hypothetical protein Acidianus hospitalis W1 # 650716001|650846203 Ahos_0003 transcriptional coactivator/pterin dehydratase Acidianus hospitalis W1 # 650716001|650846204 Ahos_0004 GGCT (gamma glutamyl cyclotransferase) domain-containing protein Acidianus hospitalis W1 # columns[0] = imgid # columns[1] = gene name # columns[2] = organism open my $imgid2names, "/srv/db/img/4.1/blastdbs/img4.1_id2names.txt", or die "Couldn't open img4.1_id2names.txt\n"; open my $img_temp_OUT, ">img_output_temp.txt"; while (<$imgid2names>) { chomp $_; my @columns = split (/\t/, $_); if (exists $access2imgid{$columns[0]}) { foreach my $orfid (keys $access2imgid{$columns[0]}) { #print "$orfid\n"; $img2reciprocal{$columns[0]} = "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]"; print {$img_temp_OUT} "$access2imgid{$columns[0]}{$orfid}\t$columns[1]\t$columns[2]\n"; #$img2reciprocal{$columns[0]} = "$orfid\t$columns[1]\t$columns[2]"; #print {$img_temp_OUT} "$orfid\t$columns[1]\t$columns[2]\n"; #print Dumper (\%access2imgid); } } } #print Dumper (\%access2imgid); close($IMGblast); close($imgid2names); close($img_temp_OUT); # read my reciprocal img blast output and store in hash # SAMPLE # 2513020047|2513221347 phycis_01043 39.30 285 168 4 21 301 21 304 2e-51 172 # 648028035|648160186 phycis_03502 40.55 217 122 4 7 221 422 633 1e-48 167 # 639633053|639783588 phycis_00179 49.23 260 121 4 14 269 20 272 3e-80 246 # 639633064|639773205 phycis_02647 29.24 383 234 11 8 370 3 368 2e-45 160 open my $rIMGblast, "./subsetimg.faaVS$locus.faa.blastp", or die "Couldn't open file subsetimg.faaVS$locus.faa.blastp\n"; open my $img_temp_OUT2, ">img_output_temp2.txt"; while (<$rIMGblast>) { chomp $_; my @columns = split (/\t/, $_); if (exists $img2reciprocal{$columns[0]}) { print {$img_temp_OUT2} $img2reciprocal{$columns[0]} . "\treciprocal\n"; } else { print {$img_temp_OUT2} "$columns[0]\t$columns[1]\tNA\tNA\tNA\tNA\tNOT reciprocal\n"; } } close($img_temp_OUT2); # hashes for uniref my %hash4 = (); my %hash5 =(); my %hash6 = (); # read uniref blast and store in hash open my $unirefblast, "./$locus.faaVSuniref90.blastp", or die "Couldn't open file $locus.faaVSuniref90.blastp\n"; while (<$unirefblast>) { chomp $_; my @columns = split (/\t/, $_); if ($columns[11] > 60) { $hash4{$columns[0]} = $columns[1]; $hash4{$columns[1]} = $columns[0]; $hash5{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; } } # read uniref id2names.txt open my $unirefid2names, "/srv/db/uniprot/uniref-20140403/uniref90_id2names.txt", or die "Couldn't open id2names.txt\n"; open my $uniref_temp_OUT, ">uniref_output_temp.txt"; while (<$unirefid2names>) { chomp $_; my @columns = split (/\t/, $_); if (exists $hash5{$columns[0]}) { $hash6{$columns[0]} = "$hash5{$columns[0]}\t$columns[1]\t$columns[2]"; print {$uniref_temp_OUT} "$hash5{$columns[0]}\t$columns[1]\t$columns[2]\n"; } } close($unirefblast); close($unirefid2names); close($uniref_temp_OUT); # read my reciprocal img blast output and store in hash open my $runirefblast, "./subsetuniref.faaVS$locus.faa.blastp", or die "Couldn't open file subsetuniref.faaVS$locus.faa.blastp\n"; open my $uniref_temp_OUT2, ">uniref_output_temp2.txt"; while (<$runirefblast>) { chomp $_; my @columns = split (/\t/, $_); if (exists $hash6{$columns[0]}) { print {$uniref_temp_OUT2} "$hash6{$columns[0]}\treciprocal\n"; } } close($uniref_temp_OUT2); # hashes for pfam my %hash7 = (); my %hash8 = (); # read pfam hmmscan output and store in hash open my $pfamoutput, "./$locus.faaVSPfam-A.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVSPfam-A.hmm.hmmscanned.tab\n"; while (<$pfamoutput>) { next if /^\s*(#.*)?$/; next if $pfamoutput =~ /^#/; next if $pfamoutput =~ /^=/; chomp $_; my @columns = split (/\t/, $_); if ($columns[11] > 60) { my @pfam_columns = split (/\./, $columns[5]); my $pfam_id = $pfam_columns[0]; $hash7{$columns[0]} = $pfam_columns[0]; $hash7{$pfam_columns[0]} = $columns[0]; $hash8{$pfam_columns[0]} = "$columns[0]\t$columns[12]\t$columns[11]"; } } # read Pfam-A.clans.tsv open my $pfamid2names, "/srv/db/pfam/27/Pfam-A.clans.tsv", or die "Couldn't open Pfam-A.clans.tsv\n"; open my $pfam_temp_OUT, ">pfam_output_temp.txt"; while (<$pfamid2names>) { chomp $_; my @columns = split (/\t/, $_); if (exists $hash8{$columns[0]}) { print {$pfam_temp_OUT} "$hash8{$columns[0]}\t$columns[4]\n"; } } close($pfamoutput); close($pfamid2names); close($pfam_temp_OUT); # hashes for tigrfam my %hash9 = (); my %hash10 = (); # read tigrfam hmmscan output and store in hash open my $tigrfamoutput, "./$locus.faaVStigr_all.hmm.hmmscanned.tab", or die "Couldn't open file $locus.faaVStigr_all.hmm.hmmscanned.tab\n"; while (<$tigrfamoutput>) { next if /^\s*(#.*)?$/; next if $tigrfamoutput =~ /^#/; chomp $_; my @columns = split (/\t/, $_); if ($columns[5] > 10) { $hash9{$columns[2]} = $columns[0]; $hash9{$columns[0]} = $columns[2]; $hash10{$columns[0]} = "$columns[2]\t$columns[4]\t$columns[5]"; } } # read tigrfam id2names2description open my $tigrfamid2names, "/srv/db/tigrfam/14.0/TIGRFAMs_14.0_INFO/tigr_info_combined.parsed_updated2", or die "Couldn't open tigr_info_combined.parsed_updated2\n"; open my $tigrfam_temp_OUT, ">tigrfam_output_temp.txt"; while (<$tigrfamid2names>) { chomp $_; my @columns = split (/\t/, $_); $columns[0] =~ s/^\s+|\s+$//g; #$columns[0] =~ s/^\s+//; #$columns[0] =~ s/\s+$//; if (exists $hash10{$columns[0]}) { print {$tigrfam_temp_OUT} "$hash10{$columns[0]}\t$columns[1]\t$columns[2]\n"; } } close($tigrfamoutput); close($tigrfamid2names); close($tigrfam_temp_OUT); # hashes for cog my %hash11 = (); my %hash12 = (); my %hash13 = (); # read cog blastp output and store in hash open my $cogblast, "./$locus.faaVSCOG.blastp", or die "Couldn't open file $locus.faaVSCOG.blastp\n"; while (<$cogblast>) { chomp $_; my @columns = split (/\t/, $_); if ($columns[11] > 60) { $hash11{$columns[0]} = $columns[1]; $hash11{$columns[1]} = $columns[0]; $hash12{$columns[1]} = "$columns[0]\t$columns[10]\t$columns[11]"; } } # read cog prot2COG.tab open my $cogid2names, "/srv/db/cog/prot2COG.tab", or die "Couldn't open prot2COG.tab\n"; open my $cog_temp_OUT, "> cog_output_temp.txt"; while (<$cogid2names>) { chomp $_; my @columns = split (/\t/, $_); if (exists $hash12{$columns[0]}) { $hash13{$columns[0]} = "$hash12{$columns[0]}\t$columns[1]"; $hash13{$columns[1]} = $hash12{$columns[0]}; print {$cog_temp_OUT} "$hash12{$columns[0]}\t$columns[1]\n"; } } close($cogblast); close($cogid2names); close($cog_temp_OUT); # read cog listcogs.txt open my $cogid2longernames, "/srv/db/cog/listcogs.txt", or die "Couldn't open listcogs.txt\n"; open my $cog_temp_OUT2, "> cog_output_temp2.txt"; while(<$cogid2longernames>) { chomp $_; my @columns = split (/\t/, $_); if (exists $hash13{$columns[5]}) { print {$cog_temp_OUT2} "$hash13{$columns[5]}\t$columns[3]\t$columns[4]\t$columns[6]\n"; } } close($cog_temp_OUT2); ### now to parse all the temporary files and combine into one tab-delimited-file # to store the IDs => DB => values/annotations my %combined_bighash =(); # open file for output open my $FINAL_OUTPUT, "> ./final_output.txt"; # print header print {$FINAL_OUTPUT} "ORF_ID\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; # img open my $img_annotation, "./img_output_temp2.txt", or die "Couldn't open img_output_temp2.txt\n"; while (<$img_annotation>) { chomp $_; my @columns = split (/\t/, $_); # my @baba = @columns[1..$#columns]; my @baba = @columns[2..$#columns]; #print "@baba \n"; #$combined_bighash{$columns[0]}->{'img'} = join("\t", @baba); push @{$combined_bighash{$columns[1]}->{'a-img'}}, join("\t", @baba); } # uniref open my $uniref_annotation, "./uniref_output_temp2.txt", or die "Couldn't open uniref_output_temp2.txt\n"; while (<$uniref_annotation>) { chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); } # prokka # need to parse faa file to give prokka id2names checkAndRunCommand("grep",[[ "'>'", "prokka_annotation/$locus.faa |", "sed", "'s/>//g' |", "sed", -e => "'s/ /\\t/'", "> prokka_temp_output.txt", ]], DIE_ON_FAILURE); # SAMPLE gff file ##gff-version 3 ##sequence-region contig_3875 1 10320 #contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 #contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 #contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 #open my $prokka_gff, "./prokka_annotation/$locus.gff", or die "Couldn't open $locus.gff\n"; #while (<$prokka_gff>) #{ # next if $prokka_gff =~ /^#/; # chomp $_; # my @main_columns = split (/\t/, $_); # $prokka_gff = my $ID =~ m/[ID\=](.*)[\;]/; # $prokka_gff = my $product =~ m/[product\=](.*)[\;]/; # print "$ID\t$product\n"; #} open my $prokka_annotation, "./prokka_temp_output.txt", or die "Couldn't open prokka_temp_output.txt\n"; while (<$prokka_annotation>) { chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'prokka'} = join("\t", @baba); push @{$combined_bighash{$columns[0]}->{'c-prokka'}}, join("\t", @baba); } # cog open my $cog_annotation, "./cog_output_temp2.txt", or die "Couldn't open cog_output_temp2.txt\n"; while (<$cog_annotation>) { chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'cog'} = join("\t", @baba); push @{$combined_bighash{$columns[0]}->{'d-cog'}}, join("\t", @baba); } # pfam open my $pfam_annotation, "./pfam_output_temp.txt", or die "Couldn't open pfam_output_temp.txt\n"; while (<$pfam_annotation>) { chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'pfam'} = join("\t", @baba); push @{$combined_bighash{$columns[0]}->{'e-pfam'}}, join("\t", @baba); } #print Dumper \%combined_bighash; # tigrfam open my $tigrfam_annotation, "./tigrfam_output_temp.txt", or die "Couldn't open tigrfam_output_temp.txt\n"; while (<$tigrfam_annotation>) { chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; #$combined_bighash{$columns[0]}->{'tigrfam'} = join("\t", @baba); push @{$combined_bighash{$columns[0]}->{'f-tigrfam'}}, join("\t", @baba); } # to print finally................. # assign key and value for number of expected columns for each annotation type, important for putting NA in missing annotation types my %column_lengths = ( 'a-img' => 5, 'b-uniref' => 5, 'c-prokka' => 1, 'd-cog' => 5, 'e-pfam' => 3, 'f-tigrfam' => 4, ); # print the orfids first foreach my $ID (sort(keys %combined_bighash)) { print {$FINAL_OUTPUT} "$ID\t"; foreach my $annotation_type (sort(keys %column_lengths)) { # if the annotation type does not exist, print NA in the columns depending on the %column_lengths hash values if (! exists $combined_bighash{$ID}->{$annotation_type}) { # cool way of printing a certain string multiple times based on the values in the hash print {$FINAL_OUTPUT} join("\t", ("NA",) x $column_lengths{$annotation_type}), "\t"; } # check the derefencing made with @{$combined_bighash{$columns[0]}->{'f-tigrfam'}} and so on.. # the derefencing allows the hash be converted into an array so that we can read the hash for the different types of annotation types elsif (ref($combined_bighash{$ID}->{$annotation_type}) eq 'ARRAY') { # place to store the columns in the values of the hash annotation types my @storage_array; foreach my $line (@{$combined_bighash{$ID}->{$annotation_type}}) { # each annotation types have different number of columns, so we need to split the columns first before # we can add in the extra values if lets say an orfid hits multiple pfam/cog/tigrfam values my @values = split("\t",$line); # cool and alternate way of doing columns[1] = values[1], and so on.., repetitively # what it basically means as long as the value i less than the number of columns in each annotation type # add +1 to the string $i and do the push below for (my $i = 0; $i <= $#values; $i++) { push @{$storage_array[$i]}, $values[$i]; } } #print Dumper(\@storage_array); # array to store the multiple hits in each column. eg. test0001 orfid hits multiple pfam values pf0008 & pf0010 # so we would like to have the values combined together in the same column delimited by a comma my @print_info_array; for (my $i = 0; $i < $column_lengths{$annotation_type}; $i++) { push @print_info_array, join("; ", @{$storage_array[$i]}); } #print Dumper(\@print_info_array); print {$FINAL_OUTPUT} join("\t", @print_info_array), "\t"; } else { print {$FINAL_OUTPUT} "$combined_bighash{$ID}{$annotation_type}\t"; } } print {$FINAL_OUTPUT} "\n"; } #close all files close($img_annotation); close($uniref_annotation); close($prokka_annotation); close($cog_annotation); close($pfam_annotation); close($tigrfam_annotation); close($FINAL_OUTPUT); ###################################################################### # CUSTOM SUBS ###################################################################### # who needs custom subs... ###################################################################### # TEMPLATE SUBS ###################################################################### # PARAMETERS sub checkParams { #----- # Do any and all options checking here... # my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "kingdom|k:s", "threads|t:s", "evalue|e:s"); my %options; # Add any other command line options, and the code to handle them # GetOptions( \%options, @standard_options ); # if no arguments supplied print the usage and exit # exec("pod2usage $0") if (0 == (keys (%options) )); # If the -help option is set, print the usage and exit # exec("pod2usage $0") if $options{'help'}; # Compulsory items #if(!exists $options{''} ) { printParamError (""); } if(!exists $options{'in'} ) { printParamError ("You MUST supply a fasta file"); } return \%options; } sub printParamError { #----- # What to do if there's something wrong with a parameter # my ($error) = @_; print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); } sub overrideDefault { #----- # Set and override default values for parameters # my ($default_value, $option_name) = @_; if(exists $global_options->{$option_name}) { return $global_options->{$option_name}; } return $default_value; } ##################################################################### # FILE IO sub openWrite { #----- # Open a file for writing # my ($fn) = @_; open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; return $fh; } sub openRead { #----- # Open a file for reading # my ($fn) = @_; open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; return $fh; } ###################################################################### # EXTERNAL COMMANDS # # checkAndRunCommand("ls", { # -a => "" # }, # WARN_ON_FAILURE); sub checkFileExists { #----- # Does a file exists? # my ($file) = @_; unless(-e $file) { croak "**ERROR: $0 : Cannot find:\n$file\n"; } } sub logExternalCommand { #----- # Log a command line command to the command line! # if(1 == $global_log_commands) { print $_[0], "\n"; } } sub isCommandInPath { #----- # Is this command in the path? # my ($cmd, $failure_type) = @_; if (system("which $cmd |> /dev/null")) { handleCommandFailure($cmd, $failure_type); } } sub runExternalCommand { #----- # Run a command line command on the command line! # my ($cmd) = @_; logExternalCommand($cmd); system($cmd); } sub checkAndRunCommand { #----- # Run external commands more sanelier # my ($cmd, $params, $failure_type) = @_; isCommandInPath($cmd, $failure_type); # join the parameters to the command my $param_str = join " ", map {formatParams($_)} @{$params}; my $cmd_str = $cmd . " " . $param_str; print "The command currently running:\t$cmd_str\n"; logExternalCommand($cmd_str); # make sure that all went well if (system($cmd_str)) { handleCommandFailure($cmd_str, $failure_type) } } sub formatParams { #--------- # Handles and formats the different ways of passing parameters to # checkAndRunCommand # my $ref = shift; if (ref($ref) eq "ARRAY") { return join(" ", @{$ref}); } elsif (ref($ref) eq "HASH") { return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); } croak 'The elements of the $params argument in checkAndRunCommand can ' . 'only contain references to arrays or hashes\n'; } sub handleCommandFailure { #----- # What to do when all goes bad! # my ($cmd, $failure_type) = @_; if (defined($failure_type)) { if ($failure_type == DIE_ON_FAILURE) { croak "**ERROR: $0 : " . $! . "\n"; } elsif ($failure_type == WARN_ON_FAILURE) { carp "**WARNING: $0 : " . $! . "\n"; } } } ###################################################################### # MISC sub printAtStart { print<<"EOF"; ---------------------------------------------------------------- $0 annotateM - annotate my genome Due to the blast processes against multiple databases, this whole annotation pipeline will usually take awhile. Please be patient! What you get in the end will save you heaps of time. ---------------------------------------------------------------- EOF } __DATA__ =head1 NAME annotateM =head1 COPYRIGHT Copyright (C) Mohamed Fauzi Haroon Special appearance from Adam Skarshewski This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . =head1 DESCRIPTION Want to annotate your genome? annotateM! =head1 SYNOPSIS annotateM -i [fasta_file] -l [locus] -k [kingdom] -t [threads] -e [evalue] -i FASTA_FILE Nucleotide fasta file -l locustag Name of locus tag -k kingdom (Bacteria/Archaea/Phage/Viruses) Kingdom of genome to be annotated -t threads Number of threads -e evalue Evalue for BLAST, recommended 1e-3 [-help -h] Displays basic usage information =cut