#!/usr/bin/env perl ############################################################################### # # post_annotateM # # The idea here is an automated way of annotating your genome based on # multiple available databases and to produce a tab-delimited file of # all the annotations, evalues, scores, descriptions. # # Suggested workflow: # 1) run your genome nucleotide fasta file through annotateM # 2) then run post_annotateM to include the contig id,orf_start and end # 3) generate a tab-delimited file # 4) open the file in ms excel or oo calc # 5) manually curate the annotations based on evalues/scores/desc etc # 6) metabolic reconstruction of organism # # Copyright (C) Mohamed Fauzi Haroon # Special appearance from Adam Skarshewski # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ############################################################################### #pragmas use strict; use warnings; #core Perl modules use Getopt::Long; use Carp; use Data::Dumper; #CPAN modules #locally-written modules BEGIN { select(STDERR); $| = 1; select(STDOUT); $| = 1; } # edit here to log all external commands my $global_log_commands = 0; # ext command failure levels use constant { IGNORE_FAILURE => 0, WARN_ON_FAILURE => 1, DIE_ON_FAILURE => 2 }; # get input params and print copyright printAtStart(); my $global_options = checkParams(); ###################################################################### # CODE HERE ###################################################################### # check that the final_output.txt file exists checkFileExists($global_options->{'in'}); # identify the ORF called amino acid fasta file my $locus = $global_options->{'locustag'}; # gff file my $gff_file = $global_options->{'gff'}; # to store hash my %hashy = (); my %orf = (); # open my final_output.txt file open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n"; while (<$final_output>) { #skips first line next if $. < 2; chomp $_; my @columns = split (/\t/, $_); my @baba = @columns[1..$#columns]; $hashy{$columns[0]} = join("\t", @baba); #$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba); # push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba); } close($final_output); #print Dumper (\%orf); # open the prokka annotation gff file # sample gff file: ###gff-version 3 ##sequence-region contig_3875 1 10320 #contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001 #contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002 #contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003 open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt"; print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n"; open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n"; while (<$gff>) { if ($_ =~ /^##/){ if ($_ =~ /^##FASTA/){ last; } else{ next; } } chomp $_; my @orf_id = (); my @columns = split (/\t/, $_); #my @last_column = split (/\t/, $columns[8]); # $columns[8] =~ /(ID=)/; # print $1; if ($_ =~ /ID=(\w+);/) { push @orf_id, $1; # print Dumper (\@orf_id); } foreach my $value_in_orf_array (@orf_id) { if (exists $hashy{$value_in_orf_array}) { # print "$value_in_orf_array\t$columns[3]\t$columns[4]"; my @print_array = [$value_in_orf_array, $columns[3], $columns[4]]; # unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array); # $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t"; print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n"; } } } close($gff); close($SUPER_FINAL_OUTPUT); print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n"; ###################################################################### # CUSTOM SUBS ###################################################################### # who needs custom subs... ###################################################################### # TEMPLATE SUBS ###################################################################### # PARAMETERS sub checkParams { #----- # Do any and all options checking here... # my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s"); my %options; # Add any other command line options, and the code to handle them # GetOptions( \%options, @standard_options ); # if no arguments supplied print the usage and exit # exec("pod2usage $0") if (0 == (keys (%options) )); # If the -help option is set, print the usage and exit # exec("pod2usage $0") if $options{'help'}; # Compulsory items #if(!exists $options{''} ) { printParamError (""); } if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); } return \%options; } sub printParamError { #----- # What to do if there's something wrong with a parameter # my ($error) = @_; print "**ERROR: $0 : $error\n"; exec("pod2usage $0"); } sub overrideDefault { #----- # Set and override default values for parameters # my ($default_value, $option_name) = @_; if(exists $global_options->{$option_name}) { return $global_options->{$option_name}; } return $default_value; } ##################################################################### # FILE IO sub openWrite { #----- # Open a file for writing # my ($fn) = @_; open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n"; return $fh; } sub openRead { #----- # Open a file for reading # my ($fn) = @_; open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n"; return $fh; } ###################################################################### # EXTERNAL COMMANDS # # checkAndRunCommand("ls", { # -a => "" # }, # WARN_ON_FAILURE); sub checkFileExists { #----- # Does a file exists? # my ($file) = @_; unless(-e $file) { croak "**ERROR: $0 : Cannot find:\n$file\n"; } } sub logExternalCommand { #----- # Log a command line command to the command line! # if(1 == $global_log_commands) { print $_[0], "\n"; } } sub isCommandInPath { #----- # Is this command in the path? # my ($cmd, $failure_type) = @_; if (system("which $cmd |> /dev/null")) { handleCommandFailure($cmd, $failure_type); } } sub runExternalCommand { #----- # Run a command line command on the command line! # my ($cmd) = @_; logExternalCommand($cmd); system($cmd); } sub checkAndRunCommand { #----- # Run external commands more sanelier # my ($cmd, $params, $failure_type) = @_; isCommandInPath($cmd, $failure_type); # join the parameters to the command my $param_str = join " ", map {formatParams($_)} @{$params}; my $cmd_str = $cmd . " " . $param_str; print "The command currently running:\t$cmd_str\n"; logExternalCommand($cmd_str); # make sure that all went well if (system($cmd_str)) { handleCommandFailure($cmd_str, $failure_type) } } sub formatParams { #--------- # Handles and formats the different ways of passing parameters to # checkAndRunCommand # my $ref = shift; if (ref($ref) eq "ARRAY") { return join(" ", @{$ref}); } elsif (ref($ref) eq "HASH") { return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref}); } croak 'The elements of the $params argument in checkAndRunCommand can ' . 'only contain references to arrays or hashes\n'; } sub handleCommandFailure { #----- # What to do when all goes bad! # my ($cmd, $failure_type) = @_; if (defined($failure_type)) { if ($failure_type == DIE_ON_FAILURE) { croak "**ERROR: $0 : " . $! . "\n"; } elsif ($failure_type == WARN_ON_FAILURE) { carp "**WARNING: $0 : " . $! . "\n"; } } } ###################################################################### # MISC sub printAtStart { print<<"EOF"; ---------------------------------------------------------------- $0 post_annotateM - annotate my genome Due to the blast processes against multiple databases, this whole annotation pipeline will usually take awhile. Please be patient! What you get in the end will save you heaps of time. ---------------------------------------------------------------- EOF } __DATA__ =head1 NAME post_annotateM =head1 COPYRIGHT Copyright (C) Mohamed Fauzi Haroon Special appearance from Adam Skarshewski This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . =head1 DESCRIPTION Want to annotate your genome? annotateM! =head1 SYNOPSIS post_annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff] -i final_output.txt final_output.txt file generated from annotateM -l locustag Name of locus tag -g locustag.gff location of gff file in prokka_annotation folder [-help -h] Displays basic usage information =cut