#!/usr/bin/env perl
use 5.18.0;
use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use IO::Uncompress::AnyUncompress qw(anyuncompress $AnyUncompressError);
use IO::Handle;

#......................................................................................

our $VERSION = "0.9.0";
our $EXE = basename($0);
our $STDIN = '/dev/stdin'; # '-' will be replaced with this
our $URL = 'https://github.com/tseemann/any2fasta';

#......................................................................................

my %aa = (
  'ALA' => 'A', 'ARG' => 'R', 'ASN' => 'N', 'ASP' => 'D',
  'CYS' => 'C', 'GLU' => 'E', 'GLN' => 'Q', 'GLY' => 'G',
  'HIS' => 'H', 'ILE' => 'I', 'LEU' => 'L', 'LYS' => 'K',
  'MET' => 'M', 'PHE' => 'F', 'PRO' => 'P', 'SER' => 'S',
  'THR' => 'T', 'TRP' => 'W', 'TYR' => 'Y', 'VAL' => 'V',
  # Optional: Inclusion of non-standard or ambiguous codes
  'SEC' => 'U', 'PYL' => 'O', 'ASX' => 'B', 'GLX' => 'Z', 
  'XAA' => 'X', 'TER' => '*'
);
my $AA = join '', sort values %aa;
my $DNA = 'ATGCN';
#......................................................................................

sub usage {
  my($errcode) = @_;
  $errcode ||= 0;
  my $ofh = $errcode ? \*STDERR : \*STDOUT;
  print $ofh 
    "NAME\n  $EXE $VERSION\n",
    "SYNOPSIS\n  Convert various sequence formats into FASTA\n",
    "USAGE\n  $EXE [opts] file.{gb,fa,fq,gff,gfa,clw,sth}[.gz,bz2,zip,zstd] > output.fasta\n",
    "OPTIONS\n",
    "  -h       Print this help\n",
    "  -v       Print version and exit\n",
    "  -q       No output while running, only errors\n",
    "  -k       Skip, don't die, on bad input files\n",
    "  -n       Purify sequences: DNA='N' AA='X'\n",
    "  -p       Force protein mode (disables auto-detect)\n",
    "  -l       Lowercase the sequence\n",
    "  -u       Uppercase the sequence\n",
    "  -g       Include VERSION (GENBANK,EMBL)\n",
    "  -s       Strip sequence descriptions (FASTA,FASTQ)\n",
    "HOMEPAGE\n  $URL\n",
    "END\n";
  exit($errcode);
}

sub version {
  print "$EXE $VERSION\n";
  exit(0);
}

#......................................................................................

my %opt;
getopts('vhqnlukgsp', \%opt) or exit(-1);
$opt{v} and version();
$opt{h} and usage(0);
@ARGV or usage(1);
my $prot = $opt{p};

#......................................................................................
sub msg {
  print STDERR "@_\n" unless $opt{q};
}
sub wrn {
  msg("WARNING:", @_);
}
sub err {
  print STDERR "ERROR: @_\n";
  exit(-1);
}
#......................................................................................

msg("This is $EXE $VERSION");

# regexp to function mapping
# put in order of most-likely to be encountered
my @formats = (
  [ 'GENBANK',   qr/^LOCUS\h/,       \&parse_genbank   ],
  [ 'UNIPROT',   qr/^ID\h+Reviewed/, \&parse_embl      ],
  [ 'EMBL',      qr/^ID\h/,          \&parse_embl      ],
  [ 'FASTA',     qr/^>\S/,           \&parse_fasta     ],
  [ 'FASTQ',     qr/^@\S/,           \&parse_fastq     ],
  [ 'GFF',       qr/^##gff/,         \&parse_gff       ],
  [ 'CLUSTAL',   qr/^(CLUST|MUSCL)/, \&parse_clustal   ],
  [ 'STOCKHOLM', qr/^# STOCKHOLM\s/, \&parse_stockholm ],
  [ 'GFA',       qr/^[A-Z]\t/,       \&parse_gfa       ],
  [ 'PDB',       qr/^HEADER\h/,      \&parse_pdb       ],
);

# loop over all positional command line arguments
my $processed=0;

# are we in '-k' skip mode?
my $notify = $opt{k} ? \&wrn :\&err;

FILE:
for my $infile (@ARGV) {
  msg("Opening '$infile'");
  $infile = $STDIN if $infile eq '-';
  if (-d $infile) {
    $notify->("'$infile' is a directory not a file");
    next FILE;
  }
  if (! -r $infile) {
    $notify->("'$infile' is not readable");
    next FILE;
  }
  # read first line to see if we have any data
  my $unzip = IO::Uncompress::AnyUncompress->new(
    $infile,
    'MultiStream' => 1,  # fix Issue #34
    'Transparent'  => 1, # allow non-compressed data 
    'BlockSize'=> (1 << 16), # in bytes
  );
  if (! $unzip) {
    $notify->($AnyUncompressError);
    next FILE;
  }
  my $header = scalar(<$unzip>); # read first line
  if (not $header) {
    $notify->("The input appears to be empty");
    next FILE;
  }

  # detect format from first line
  my $ok=0;
  for my $fmt (@formats) {
    if ($header =~ $fmt->[1]) {
      msg("Detected", $fmt->[0], "format");
      # read in the rest of the file now
      my @line = ($header, <$unzip>);
      my $lines = scalar(@line);
      msg("Read $lines lines from '$infile'");
      # run the parser
      my $nseq = $fmt->[2]->( \@line );
      msg("Wrote $nseq sequences from", $fmt->[0], "file.");
      $nseq or $notify->("No sequences found in '$infile'");
      $ok++;
      last;
    }
  }
  $ok or $notify->("Unfamilar format with first line: $header");
  $processed++;
}

msg("Processed $processed files.");

msg("Done.");
exit(0);

#......................................................................................

sub purify_seq {
  my($seq) = @_;
  if ($opt{n}) {
    if ($prot or $opt{'p'}) {
      $seq =~ s/[^${AA}\v\-]/X/gi;
    }
    else {
      $seq =~ s/[^${DNA}\v\-]/N/gi;
    }
  }            
  $seq = lc($seq) if $opt{l};
  $seq = uc($seq) if $opt{u};
  return $seq;
}

#......................................................................................

sub is_protein {
  my $seq = shift;
  my $len = length($seq);
  $seq =~ s/[ATCG]//gi;
  #msg("is_prot: $len / $seq");
  return length($seq)/$len > 0.666;
}

#......................................................................................

sub purify_id {
  my($id) = @_;
  # \V is non-(vertical-space) eg. cr nl
  # \h is     horizonatel-spce eg. spc tan
  # we don't use $ as that strips the newline
  $id =~ s/\h\V*// if $opt{'s'};
  return $id;
}

#......................................................................................

sub parse_fasta {
  my($lines) = @_;
  my $count=0;
  my $sl = 0;
  for my $line (@$lines) {
    next if $line =~ m/^\s*$/;
    if ($line =~ m/^>/) {
      $line = purify_id($line);
      $count++;
      $sl=0;
    }
    else {
      $sl++;
      # for first seq line in file, detect type
      if ($count==1 and $sl==1 and is_protein($line)) {
        $prot = is_protein($line);
        msg("Auto-detected alphabet:", $prot ? 'PROT' : 'DNA');
      }
      $line = purify_seq($line);
    }
    print $line;
  }
  return $count;
}

#......................................................................................

sub parse_fastq {
  my($lines) = @_;
  my $count=0;
  # jump 4 lines at a time
  for ( my $i=0 ; $i < $#{$lines} ; $i+=4 ) {
    $lines->[$i] = purify_id($lines->[$i]);
    print ">", substr($lines->[$i], 1);
    print purify_seq( $lines->[$i+1] );
    $count++;
  }
  return $count;
}

#......................................................................................

sub parse_gff {
  my($lines) = @_;
  # Skip past until we get to ##FASTA section
  while (my $line = shift @$lines) {
    if ($line =~ m/^>/) {
      unshift @$lines, $line;
      # let the FASTA parser do the work now
      return parse_fasta(\@$lines);
    }
  }
  return 0;
}

#......................................................................................
# https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 
# https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md 

sub parse_gfa {
  my($lines) = @_;
  my %S;
  my %P;
  my $count=0;
  for my $line (@$lines) {
    my(@x) = split m/\t/, $line;
    # this is NOT the original contigs, rather the unitigs
    # need to parse L (link) and P (path) to reconstruct the contigs
    if ($x[0] eq 'S') {
      print ">", $x[1], "\n", purify_seq($x[2]), "\n";
      $count++;
    }
  }
  return $count;
}

#......................................................................................

sub parse_genbank {
  my($lines) = @_;
  my $acc = '';
  my $dna = '';
  my $in_seq = 0;
  my $count = 0;

  foreach (@$lines) {
    chomp;
    if (m{^//}) {
      print ">", $acc, "\n", purify_seq($dna);
      $count++;
      $in_seq = 0;
      $dna = $acc = '';
      next;
    }
    elsif (m/^ORIGIN/) {
      # ORIGIN
      $in_seq = 1;
      next;
    }
    
    if ($in_seq) {
      #       421 ctctcaaact aaagccgtct cactctccat gagtcgttcg acagatcgcg ttttaaattg
      my $s = substr $_, 10;  # trim the coordinate prefix
      $s =~ s/\s//g;
      $dna .= $s . "\n";
    }
    else {
      # gbk = LOCUS  NZ_AHMY02000075  683 bp   DNA   linear   CON 23-NOV-2017
      # gp  = LOCUS  KJK60552     826 aa  linear   PLN 18-MAR-2015
      if (m/^LOCUS\s+(\S+)/) {
        $acc = $1;
        $prot = m/\b\d+ aa\b/; # DNA or AA mode?
      }
      # VERSION     NZ_AHMY02000075.1
      # this is not elsif in case ther eis no VERSION
      if ($opt{g} and m/^VERSION\s+(\S+)/) {
        $acc = $1;
      }
    }
  }
  return $count;
}

#......................................................................................

sub parse_embl {
  my($lines) = @_;
  my $acc = '';
  my $in_seq = 0;
  my $dna = '';
  my $count = 0;

  foreach (@$lines) {
    chomp;
    if (m{^//}) {
      # end of record
      print ">", $acc, "\n", purify_seq($dna);
      $count++;
      $in_seq = 0;
      $dna = $acc = '';
      next;
    }
    elsif (m/^SQ\s/) {
      # SQ   Sequence 569 BP; 145 A; 133 C; 152 G; 139 T; 0 other;
      # SQ   SEQUENCE   259 AA;  27989 MW;  498AB8B2B9722DEC >
      $in_seq = 1;
      $prot = m/\b\d+ AA\b/; # choose AA or DNA mode
      next;
    }

    if ($in_seq) {
      #      agtcgctttt aattatgaat gttgtaacta cattatcatc gctgtcagtc ttctggctgg        60
      #      MGIFDGKVAI ITGGGKAKSI GYGIAVAYAK EGANLVLTGR NEQKLLDAKE ELERLYGIKV
      #s/[\s\d]//g;
      s/\h//g;
      s/\d+$//;
      $dna .= $_ . "\n";
    }
    else {
      # ID   K02675; SV 1; linear; genomic DNA; STD; UNC; 569 BP.
      if (m/^ID\h+([^;\h]+)/) {
        $acc = $1;
        if ($opt{g} and m/\bSV\s+(\d+)/) {
          $acc .= ".$1";
        }
      }
    }
  }
  return $count;
}

#......................................................................................

sub parse_clustal {
  my($lines) = @_;
  my %seq;
  my @order;  # keep track of sequence identifiers order
  foreach (@$lines) {
    # uses '-' for gap, ignore optional position number
    next unless m'^(\S+)\s+([A-Z-]+)(?:\s+\d+)?$'i;
    push @order, $1 unless exists $seq{$1};
    $seq{$1} .= $2."\n";
  }
  for my $id (@order) {
    print ">", $id, "\n", purify_seq($seq{$id});
  }
  return scalar(keys %seq);
}

#......................................................................................

sub parse_stockholm {
  my($lines) = @_;
  my %seq;
  my @order;  # keep track of sequence identifiers order
  foreach (@$lines) {
    next if m/^#/;
    last if m{^//};
    # uses '.' and also '-' for gap, optional position number
    next unless m'^(\S+)\s+([A-Z.-]+)(?:\s+\d+)?$'i; 
    my($id,$sq) = ($1,$2);
    $sq =~ s/\./-/g;  # switch to standard FASTA '-' gap char
    push @order, $id unless exists $seq{$id};
    $seq{$id} .= $sq . "\n";
  }
  for my $id (@order) {
    print ">", $id, "\n", purify_seq($seq{$id});
  }
  return scalar(keys %seq);
}

#......................................................................................

sub parse_pdb {
  my($lines) = @_;
  my %seq;
  # HEADER IMMUNE SYSTEM  06-MAR-00   1EK3
  my @hdr = split ' ', $lines->[0];
  my $prefix = $hdr[-1];
  foreach (@$lines) {
    #SEQRES   9 A  114  GLY GLY GLY THR LYS VAL GLU IL E LYS ARG
    #SEQRES   1 B  114  ASP ILE VAL MET THR GLN SER PR O ASP SER LEU ALA VAL
    next unless m/^SEQRES/;
    my @x = split ' ', $_;
    $seq{"$prefix-$x[2]"} .= 
      join( '', map { $aa{uc($_)} || 'X'  } 
        splice(@x,4) );
  }
  for my $id (sort keys %seq) {
    print ">", $id, "\n", purify_seq($seq{$id}), "\n";
  }
  return scalar(keys %seq);
}

#......................................................................................