#!/usr/bin/env perl use 5.18.0; use strict; use warnings; use Getopt::Std; use File::Basename; use IO::Uncompress::AnyUncompress qw(anyuncompress $AnyUncompressError); use IO::Handle; #...................................................................................... our $VERSION = "0.9.0"; our $EXE = basename($0); our $STDIN = '/dev/stdin'; # '-' will be replaced with this our $URL = 'https://github.com/tseemann/any2fasta'; #...................................................................................... my %aa = ( 'ALA' => 'A', 'ARG' => 'R', 'ASN' => 'N', 'ASP' => 'D', 'CYS' => 'C', 'GLU' => 'E', 'GLN' => 'Q', 'GLY' => 'G', 'HIS' => 'H', 'ILE' => 'I', 'LEU' => 'L', 'LYS' => 'K', 'MET' => 'M', 'PHE' => 'F', 'PRO' => 'P', 'SER' => 'S', 'THR' => 'T', 'TRP' => 'W', 'TYR' => 'Y', 'VAL' => 'V', # Optional: Inclusion of non-standard or ambiguous codes 'SEC' => 'U', 'PYL' => 'O', 'ASX' => 'B', 'GLX' => 'Z', 'XAA' => 'X', 'TER' => '*' ); my $AA = join '', sort values %aa; my $DNA = 'ATGCN'; #...................................................................................... sub usage { my($errcode) = @_; $errcode ||= 0; my $ofh = $errcode ? \*STDERR : \*STDOUT; print $ofh "NAME\n $EXE $VERSION\n", "SYNOPSIS\n Convert various sequence formats into FASTA\n", "USAGE\n $EXE [opts] file.{gb,fa,fq,gff,gfa,clw,sth}[.gz,bz2,zip,zstd] > output.fasta\n", "OPTIONS\n", " -h Print this help\n", " -v Print version and exit\n", " -q No output while running, only errors\n", " -k Skip, don't die, on bad input files\n", " -n Purify sequences: DNA='N' AA='X'\n", " -p Force protein mode (disables auto-detect)\n", " -l Lowercase the sequence\n", " -u Uppercase the sequence\n", " -g Include VERSION (GENBANK,EMBL)\n", " -s Strip sequence descriptions (FASTA,FASTQ)\n", "HOMEPAGE\n $URL\n", "END\n"; exit($errcode); } sub version { print "$EXE $VERSION\n"; exit(0); } #...................................................................................... my %opt; getopts('vhqnlukgsp', \%opt) or exit(-1); $opt{v} and version(); $opt{h} and usage(0); @ARGV or usage(1); my $prot = $opt{p}; #...................................................................................... sub msg { print STDERR "@_\n" unless $opt{q}; } sub wrn { msg("WARNING:", @_); } sub err { print STDERR "ERROR: @_\n"; exit(-1); } #...................................................................................... msg("This is $EXE $VERSION"); # regexp to function mapping # put in order of most-likely to be encountered my @formats = ( [ 'GENBANK', qr/^LOCUS\h/, \&parse_genbank ], [ 'UNIPROT', qr/^ID\h+Reviewed/, \&parse_embl ], [ 'EMBL', qr/^ID\h/, \&parse_embl ], [ 'FASTA', qr/^>\S/, \&parse_fasta ], [ 'FASTQ', qr/^@\S/, \&parse_fastq ], [ 'GFF', qr/^##gff/, \&parse_gff ], [ 'CLUSTAL', qr/^(CLUST|MUSCL)/, \&parse_clustal ], [ 'STOCKHOLM', qr/^# STOCKHOLM\s/, \&parse_stockholm ], [ 'GFA', qr/^[A-Z]\t/, \&parse_gfa ], [ 'PDB', qr/^HEADER\h/, \&parse_pdb ], ); # loop over all positional command line arguments my $processed=0; # are we in '-k' skip mode? my $notify = $opt{k} ? \&wrn :\&err; FILE: for my $infile (@ARGV) { msg("Opening '$infile'"); $infile = $STDIN if $infile eq '-'; if (-d $infile) { $notify->("'$infile' is a directory not a file"); next FILE; } if (! -r $infile) { $notify->("'$infile' is not readable"); next FILE; } # read first line to see if we have any data my $unzip = IO::Uncompress::AnyUncompress->new( $infile, 'MultiStream' => 1, # fix Issue #34 'Transparent' => 1, # allow non-compressed data 'BlockSize'=> (1 << 16), # in bytes ); if (! $unzip) { $notify->($AnyUncompressError); next FILE; } my $header = scalar(<$unzip>); # read first line if (not $header) { $notify->("The input appears to be empty"); next FILE; } # detect format from first line my $ok=0; for my $fmt (@formats) { if ($header =~ $fmt->[1]) { msg("Detected", $fmt->[0], "format"); # read in the rest of the file now my @line = ($header, <$unzip>); my $lines = scalar(@line); msg("Read $lines lines from '$infile'"); # run the parser my $nseq = $fmt->[2]->( \@line ); msg("Wrote $nseq sequences from", $fmt->[0], "file."); $nseq or $notify->("No sequences found in '$infile'"); $ok++; last; } } $ok or $notify->("Unfamilar format with first line: $header"); $processed++; } msg("Processed $processed files."); msg("Done."); exit(0); #...................................................................................... sub purify_seq { my($seq) = @_; if ($opt{n}) { if ($prot or $opt{'p'}) { $seq =~ s/[^${AA}\v\-]/X/gi; } else { $seq =~ s/[^${DNA}\v\-]/N/gi; } } $seq = lc($seq) if $opt{l}; $seq = uc($seq) if $opt{u}; return $seq; } #...................................................................................... sub is_protein { my $seq = shift; my $len = length($seq); $seq =~ s/[ATCG]//gi; #msg("is_prot: $len / $seq"); return length($seq)/$len > 0.666; } #...................................................................................... sub purify_id { my($id) = @_; # \V is non-(vertical-space) eg. cr nl # \h is horizonatel-spce eg. spc tan # we don't use $ as that strips the newline $id =~ s/\h\V*// if $opt{'s'}; return $id; } #...................................................................................... sub parse_fasta { my($lines) = @_; my $count=0; my $sl = 0; for my $line (@$lines) { next if $line =~ m/^\s*$/; if ($line =~ m/^>/) { $line = purify_id($line); $count++; $sl=0; } else { $sl++; # for first seq line in file, detect type if ($count==1 and $sl==1 and is_protein($line)) { $prot = is_protein($line); msg("Auto-detected alphabet:", $prot ? 'PROT' : 'DNA'); } $line = purify_seq($line); } print $line; } return $count; } #...................................................................................... sub parse_fastq { my($lines) = @_; my $count=0; # jump 4 lines at a time for ( my $i=0 ; $i < $#{$lines} ; $i+=4 ) { $lines->[$i] = purify_id($lines->[$i]); print ">", substr($lines->[$i], 1); print purify_seq( $lines->[$i+1] ); $count++; } return $count; } #...................................................................................... sub parse_gff { my($lines) = @_; # Skip past until we get to ##FASTA section while (my $line = shift @$lines) { if ($line =~ m/^>/) { unshift @$lines, $line; # let the FASTA parser do the work now return parse_fasta(\@$lines); } } return 0; } #...................................................................................... # https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md # https://github.com/GFA-spec/GFA-spec/blob/master/GFA2.md sub parse_gfa { my($lines) = @_; my %S; my %P; my $count=0; for my $line (@$lines) { my(@x) = split m/\t/, $line; # this is NOT the original contigs, rather the unitigs # need to parse L (link) and P (path) to reconstruct the contigs if ($x[0] eq 'S') { print ">", $x[1], "\n", purify_seq($x[2]), "\n"; $count++; } } return $count; } #...................................................................................... sub parse_genbank { my($lines) = @_; my $acc = ''; my $dna = ''; my $in_seq = 0; my $count = 0; foreach (@$lines) { chomp; if (m{^//}) { print ">", $acc, "\n", purify_seq($dna); $count++; $in_seq = 0; $dna = $acc = ''; next; } elsif (m/^ORIGIN/) { # ORIGIN $in_seq = 1; next; } if ($in_seq) { # 421 ctctcaaact aaagccgtct cactctccat gagtcgttcg acagatcgcg ttttaaattg my $s = substr $_, 10; # trim the coordinate prefix $s =~ s/\s//g; $dna .= $s . "\n"; } else { # gbk = LOCUS NZ_AHMY02000075 683 bp DNA linear CON 23-NOV-2017 # gp = LOCUS KJK60552 826 aa linear PLN 18-MAR-2015 if (m/^LOCUS\s+(\S+)/) { $acc = $1; $prot = m/\b\d+ aa\b/; # DNA or AA mode? } # VERSION NZ_AHMY02000075.1 # this is not elsif in case ther eis no VERSION if ($opt{g} and m/^VERSION\s+(\S+)/) { $acc = $1; } } } return $count; } #...................................................................................... sub parse_embl { my($lines) = @_; my $acc = ''; my $in_seq = 0; my $dna = ''; my $count = 0; foreach (@$lines) { chomp; if (m{^//}) { # end of record print ">", $acc, "\n", purify_seq($dna); $count++; $in_seq = 0; $dna = $acc = ''; next; } elsif (m/^SQ\s/) { # SQ Sequence 569 BP; 145 A; 133 C; 152 G; 139 T; 0 other; # SQ SEQUENCE 259 AA; 27989 MW; 498AB8B2B9722DEC > $in_seq = 1; $prot = m/\b\d+ AA\b/; # choose AA or DNA mode next; } if ($in_seq) { # agtcgctttt aattatgaat gttgtaacta cattatcatc gctgtcagtc ttctggctgg 60 # MGIFDGKVAI ITGGGKAKSI GYGIAVAYAK EGANLVLTGR NEQKLLDAKE ELERLYGIKV #s/[\s\d]//g; s/\h//g; s/\d+$//; $dna .= $_ . "\n"; } else { # ID K02675; SV 1; linear; genomic DNA; STD; UNC; 569 BP. if (m/^ID\h+([^;\h]+)/) { $acc = $1; if ($opt{g} and m/\bSV\s+(\d+)/) { $acc .= ".$1"; } } } } return $count; } #...................................................................................... sub parse_clustal { my($lines) = @_; my %seq; my @order; # keep track of sequence identifiers order foreach (@$lines) { # uses '-' for gap, ignore optional position number next unless m'^(\S+)\s+([A-Z-]+)(?:\s+\d+)?$'i; push @order, $1 unless exists $seq{$1}; $seq{$1} .= $2."\n"; } for my $id (@order) { print ">", $id, "\n", purify_seq($seq{$id}); } return scalar(keys %seq); } #...................................................................................... sub parse_stockholm { my($lines) = @_; my %seq; my @order; # keep track of sequence identifiers order foreach (@$lines) { next if m/^#/; last if m{^//}; # uses '.' and also '-' for gap, optional position number next unless m'^(\S+)\s+([A-Z.-]+)(?:\s+\d+)?$'i; my($id,$sq) = ($1,$2); $sq =~ s/\./-/g; # switch to standard FASTA '-' gap char push @order, $id unless exists $seq{$id}; $seq{$id} .= $sq . "\n"; } for my $id (@order) { print ">", $id, "\n", purify_seq($seq{$id}); } return scalar(keys %seq); } #...................................................................................... sub parse_pdb { my($lines) = @_; my %seq; # HEADER IMMUNE SYSTEM 06-MAR-00 1EK3 my @hdr = split ' ', $lines->[0]; my $prefix = $hdr[-1]; foreach (@$lines) { #SEQRES 9 A 114 GLY GLY GLY THR LYS VAL GLU IL E LYS ARG #SEQRES 1 B 114 ASP ILE VAL MET THR GLN SER PR O ASP SER LEU ALA VAL next unless m/^SEQRES/; my @x = split ' ', $_; $seq{"$prefix-$x[2]"} .= join( '', map { $aa{uc($_)} || 'X' } splice(@x,4) ); } for my $id (sort keys %seq) { print ">", $id, "\n", purify_seq($seq{$id}), "\n"; } return scalar(keys %seq); } #......................................................................................