#!/usr/bin/env perl
###############################################################################
#
# post_annotateM
#
# The idea here is an automated way of annotating your genome based on
# multiple available databases and to produce a tab-delimited file of
# all the annotations, evalues, scores, descriptions.
#
# Suggested workflow:
# 1) run your genome nucleotide fasta file through annotateM
# 2) then run post_annotateM to include the contig id,orf_start and end
# 3) generate a tab-delimited file
# 4) open the file in ms excel or oo calc
# 5) manually curate the annotations based on evalues/scores/desc etc
# 6) metabolic reconstruction of organism
#
# Copyright (C) Mohamed Fauzi Haroon
# Special appearance from Adam Skarshewski
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#
###############################################################################
#pragmas
use strict;
use warnings;
#core Perl modules
use Getopt::Long;
use Carp;
use Data::Dumper;
#CPAN modules
#locally-written modules
BEGIN {
select(STDERR);
$| = 1;
select(STDOUT);
$| = 1;
}
# edit here to log all external commands
my $global_log_commands = 0;
# ext command failure levels
use constant {
IGNORE_FAILURE => 0,
WARN_ON_FAILURE => 1,
DIE_ON_FAILURE => 2
};
# get input params and print copyright
printAtStart();
my $global_options = checkParams();
######################################################################
# CODE HERE
######################################################################
# check that the final_output.txt file exists
checkFileExists($global_options->{'in'});
# identify the ORF called amino acid fasta file
my $locus = $global_options->{'locustag'};
# gff file
my $gff_file = $global_options->{'gff'};
# to store hash
my %hashy = ();
my %orf = ();
# open my final_output.txt file
open my $final_output, "./final_output.txt", or die "Couldn't open final_output.txt\n";
while (<$final_output>)
{
#skips first line
next if $. < 2;
chomp $_;
my @columns = split (/\t/, $_);
my @baba = @columns[1..$#columns];
$hashy{$columns[0]} = join("\t", @baba);
#$combined_bighash{$columns[0]}->{'uniref'} = join("\t", @baba);
# push @{$combined_bighash{$columns[0]}->{'b-uniref'}}, join("\t", @baba);
}
close($final_output);
#print Dumper (\%orf);
# open the prokka annotation gff file
# sample gff file:
###gff-version 3
##sequence-region contig_3875 1 10320
#contig_3875 Prodigal:2.60 CDS 334 735 . + 0 ID=test_00001;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK10707;locus_tag=test_00001;product=putative NUDIX hydrolase;protein_id=gnl|VBC|test_00001
#contig_3875 Prodigal:2.60 CDS 930 3221 . + 0 ID=test_00002;eC_number=1.1.1.40;gene=maeB;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P76558;locus_tag=test_00002;product=NADP-dependent malic enzyme;protein_id=gnl|VBC|test_00002
#contig_3875 Prodigal:2.60 CDS 3229 5175 . - 0 ID=test_00003;inference=ab initio prediction:Prodigal:2.60;locus_tag=test_00003;product=hypothetical protein;protein_id=gnl|VBC|test_00003
open my $SUPER_FINAL_OUTPUT, "> ./super_final_output.txt";
print {$SUPER_FINAL_OUTPUT} "ORF_ID\tcontig_ID\tORF_start_pos\tORF_end_pos\tstrand\timg_evalue\timg_score\timg_gene\timg_organism\timg_reciprocal\tuniref_evalue\tuniref_score\tuniref_gene\tuniref_organism\tuniref_reciprocal\tprokka_gene\tcog_evalues\tcog_scores\tcog_classes\tcog_gene_acronyms\tcog_genes\tpfam_evalues\tpfam_scores\tpfam_genes\ttigrfam_evalues\ttigrfam_scores\ttigrfam_genes\ttigrfam_descriptions\n";
open my $gff, "./$gff_file", or die "Couldn't open $locus.gff\n";
while (<$gff>)
{
if ($_ =~ /^##/){
if ($_ =~ /^##FASTA/){
last;
}
else{
next;
}
}
chomp $_;
my @orf_id = ();
my @columns = split (/\t/, $_);
#my @last_column = split (/\t/, $columns[8]);
# $columns[8] =~ /(ID=)/;
# print $1;
if ($_ =~ /ID=(\w+);/)
{
push @orf_id, $1;
# print Dumper (\@orf_id);
}
foreach my $value_in_orf_array (@orf_id)
{
if (exists $hashy{$value_in_orf_array})
{
# print "$value_in_orf_array\t$columns[3]\t$columns[4]";
my @print_array = [$value_in_orf_array, $columns[3], $columns[4]];
# unshift @{$hashy{$value_in_orf_array}}, join("\t", @print_array);
# $hashy{$value_in_orf_array} .= "\t" . $value_in_orf_array . "\t$columns[3]\t$columns[4]\t";
print {$SUPER_FINAL_OUTPUT} "$value_in_orf_array\t$columns[0]\t$columns[3]\t$columns[4]\t$columns[6]\t$hashy{$value_in_orf_array}\n";
}
}
}
close($gff);
close($SUPER_FINAL_OUTPUT);
print "COMPLETED, super_final_output.txt generated! If i was you, i will open super_final_output.txt in excel\n";
######################################################################
# CUSTOM SUBS
######################################################################
# who needs custom subs...
######################################################################
# TEMPLATE SUBS
######################################################################
# PARAMETERS
sub checkParams {
#-----
# Do any and all options checking here...
#
my @standard_options = ( "help|h+", "in|i:s", "locustag|l:s", "gff|g:s");
my %options;
# Add any other command line options, and the code to handle them
#
GetOptions( \%options, @standard_options );
# if no arguments supplied print the usage and exit
#
exec("pod2usage $0") if (0 == (keys (%options) ));
# If the -help option is set, print the usage and exit
#
exec("pod2usage $0") if $options{'help'};
# Compulsory items
#if(!exists $options{''} ) { printParamError (""); }
if(!exists $options{'in'} ) { printParamError ("You MUST supply your final_output.txt from annotateM"); }
return \%options;
}
sub printParamError
{
#-----
# What to do if there's something wrong with a parameter
#
my ($error) = @_;
print "**ERROR: $0 : $error\n"; exec("pod2usage $0");
}
sub overrideDefault
{
#-----
# Set and override default values for parameters
#
my ($default_value, $option_name) = @_;
if(exists $global_options->{$option_name})
{
return $global_options->{$option_name};
}
return $default_value;
}
#####################################################################
# FILE IO
sub openWrite
{
#-----
# Open a file for writing
#
my ($fn) = @_;
open my $fh, ">", $fn or croak "**ERROR: could not open file: $fn for writing $!\n";
return $fh;
}
sub openRead
{
#-----
# Open a file for reading
#
my ($fn) = @_;
open my $fh, "<", $fn or croak "**ERROR: could not open file: $fn for reading $!\n";
return $fh;
}
######################################################################
# EXTERNAL COMMANDS
#
# checkAndRunCommand("ls", {
# -a => ""
# },
# WARN_ON_FAILURE);
sub checkFileExists {
#-----
# Does a file exists?
#
my ($file) = @_;
unless(-e $file) {
croak "**ERROR: $0 : Cannot find:\n$file\n";
}
}
sub logExternalCommand
{
#-----
# Log a command line command to the command line!
#
if(1 == $global_log_commands) {
print $_[0], "\n";
}
}
sub isCommandInPath
{
#-----
# Is this command in the path?
#
my ($cmd, $failure_type) = @_;
if (system("which $cmd |> /dev/null")) {
handleCommandFailure($cmd, $failure_type);
}
}
sub runExternalCommand
{
#-----
# Run a command line command on the command line!
#
my ($cmd) = @_;
logExternalCommand($cmd);
system($cmd);
}
sub checkAndRunCommand
{
#-----
# Run external commands more sanelier
#
my ($cmd, $params, $failure_type) = @_;
isCommandInPath($cmd, $failure_type);
# join the parameters to the command
my $param_str = join " ", map {formatParams($_)} @{$params};
my $cmd_str = $cmd . " " . $param_str;
print "The command currently running:\t$cmd_str\n";
logExternalCommand($cmd_str);
# make sure that all went well
if (system($cmd_str)) {
handleCommandFailure($cmd_str, $failure_type)
}
}
sub formatParams {
#---------
# Handles and formats the different ways of passing parameters to
# checkAndRunCommand
#
my $ref = shift;
if (ref($ref) eq "ARRAY") {
return join(" ", @{$ref});
} elsif (ref($ref) eq "HASH") {
return join(" ", map { $_ . " " . $ref->{$_}} keys %{$ref});
}
croak 'The elements of the $params argument in checkAndRunCommand can ' .
'only contain references to arrays or hashes\n';
}
sub handleCommandFailure {
#-----
# What to do when all goes bad!
#
my ($cmd, $failure_type) = @_;
if (defined($failure_type)) {
if ($failure_type == DIE_ON_FAILURE) {
croak "**ERROR: $0 : " . $! . "\n";
} elsif ($failure_type == WARN_ON_FAILURE) {
carp "**WARNING: $0 : " . $! . "\n";
}
}
}
######################################################################
# MISC
sub printAtStart {
print<<"EOF";
----------------------------------------------------------------
$0
post_annotateM - annotate my genome
Due to the blast processes against multiple databases, this whole
annotation pipeline will usually take awhile. Please be patient!
What you get in the end will save you heaps of time.
----------------------------------------------------------------
EOF
}
__DATA__
=head1 NAME
post_annotateM
=head1 COPYRIGHT
Copyright (C) Mohamed Fauzi Haroon
Special appearance from Adam Skarshewski
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
=head1 DESCRIPTION
Want to annotate your genome? annotateM!
=head1 SYNOPSIS
post_annotateM -i [finaloutputfromannotatem] -l [locustag] -g [gff]
-i final_output.txt final_output.txt file generated from annotateM
-l locustag Name of locus tag
-g locustag.gff location of gff file in prokka_annotation folder
[-help -h] Displays basic usage information
=cut