#! /usr/bin/perl # reformat.pl # Reformat a multiple alignment file # # HHsuite version 3.0.0 (15-03-2015) # # Reference: # Remmert M., Biegert A., Hauser A., and Soding J. # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). # (C) Johannes Soeding, 2012 # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de use strict; use warnings; my $numres=100; # number of residues per line my $desclen=1000; # maximum number of characters in nameline my $ARGC=scalar(@ARGV); if ($ARGC<2) { die(" reformat.pl from HHsuite3 Read a multiple alignment in one format and write it in another format Usage: reformat.pl [informat] [outformat] infile outfile [options] or reformat.pl [informat] [outformat] 'fileglob' .ext [options] Available input formats: fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps aligned to inserts: '.' a3m: like a2m, but gaps aligned to inserts MAY be omitted sto: Stockholm format; sequences in several blocks with sequence name at beginning of line (hmmer output) psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r) clu: Clustal format; sequences in several blocks with sequence name at beginning of line Available output formats: fas: aligned fasta; all gaps '-' a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps aligned to inserts: '.' a3m: like a2m, but gaps aligned to inserts are omitted sto: Stockholm format; sequences in just one block, one line per sequence psi: format as read by PSI-BLAST using the -B option clu: Clustal format If no input or output format is given the file extension is interpreted as format specification ('aln' as 'clu') Options: -v int verbose mode (0:off, 1:on) -num add number prefix to sequence names: 'name', '1:name' '2:name' etc -noss remove secondary structure sequences (beginning with >ss_) -sa do not remove solvent accessibility sequences (beginning with >sa_) -M first make all columns with residue in first sequence match columns (default for output format a2m or a3m) -M int make all columns with less than X% gaps match columns (for output format a2m or a3m) -r remove all lower case residues (insert states) (AFTER -M option has been processed) -r int remove all lower case columns with more than X% gaps -g '' suppress all gaps -g '-' write all gaps as '-' -uc write all residues in upper case (AFTER all other options have been processed) -lc write all residues in lower case (AFTER all other options have been processed) -l number of residues per line (for Clustal, FASTA, A2M, A3M formats) (default=$numres) -d maximum number of characers in nameline (default=$desclen) Examples: reformat.pl 1hjra.a3m 1hjra.a2m (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m) reformat.pl test.a3m test.fas -num -r 90 reformat.pl fas sto '*.fasta' .stockholm \n"); # clu: clustal format (hmmer output) # sel: Selex format # phy: Phylip format } my $informat=""; my $outformat=""; my $infile=""; my $outfile=""; my $num=0; # don't use sequence number as prefix: '>n|name' my $noss=0; # don't remove secondary structure my $nosa=1; # remove solvent accessibility sequences my $line; my $options=""; my @names; # names of sequences read in my @seqs; # residues of sequences read in my $n; # number of sequences read in my $k; # counts sequences for output my $remove_inserts=0; my $remove_gapped=0; my $matchmode=""; # do not change capitalization my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns my $v=2; my $update=0; my $nss=-1; # index of secondary structure sequence my $lname; # length of sequence name in clustal, stockholm, psi format my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files my @informats= ("fas","a2m","a3m","sto","psi","clu"); my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas"); my $found; my $element; my $gap="default"; my $case="default"; # Process optionsz for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";} if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;} if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;} if ($options=~s/ -num / /) {$num=1; $desclen=505;} if ($options=~s/ -noss / /) {$noss=1;} if ($options=~s/ -sa / /) {$nosa=0;} if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;} if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;} if ($options=~s/ -r / /) {$remove_inserts=1;} if ($options=~s/ -lc / /) {$case="lc";} if ($options=~s/ -uc / /) {$case="uc";} if ($options=~s/ -v\s*(\d+) / /) {$v=$1;} if ($options=~s/ -v / /) {$v=2;} if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;} if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;} if ($options=~s/ -u / /) {$update=1;} if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;} if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;} if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;} # Assign informat, outformat, infile, and outfile if ($outfile eq "") { if ($options=~s/(\S+)\s*$//) { $outfile=$1; } else { die("Error: no output file given: '$options'\n"); } } if ($infile eq "") { if ($options=~s/(\S+)\s*$//) { $infile=$1; } else { die("Error: no input file given: '$options'\n"); } } if ($options=~s/(\S+)\s*$//) { $outformat=$1; } else { if ($outfile=~/\S*\.(\S+?)$/) { $outformat=lc($1); if ($outformat eq "aln") {$outformat="clu";} elsif ($outformat eq "fa") {$outformat="fas";} elsif ($outformat eq "fasta") {$outformat="fas";} elsif ($outformat eq "afa") {$outformat="fas";} elsif ($outformat eq "afas") {$outformat="fas";} elsif ($outformat eq "afasta") {$outformat="fas";} } else { print ("Using FASTA output format: '$options'\n"); $outformat="fas"; } } if ($options=~s/(\S+)\s*$//) { $informat=$1; } else { if ($infile=~/\S*\.(\S+?)$/) { $informat=lc($1); if ($informat eq "aln") {$informat="clu";} elsif ($informat eq "fa") {$informat="fas";} elsif ($informat eq "fasta") {$informat="fas";} } else { print ("Using FASTA input format: '$options'\n"); $informat="fas"; } } # Warn if unknown options found if ($options!~/^\s*$/) { $options=~s/^\s*(.*?)\s*$/$1/g; print("\nWARNING: unknown options '$options'\n"); } # Check if input and output formats are valid $found=0; foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}} if(!$found) {die("\nError: $informat is not a valid input format option\n");} $found=0; foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}} if(!$found) {die("\nError: $outformat is not a valid output format option\n");} #if($outformat eq "psi") { # $remove_inserts=1; #} if($outformat eq "ufas") {$gap="";} if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension { $outfile=~/.*\.(\S*)$/; my $outext=$1; my @infiles=glob($infile); printf("%i files to reformat\n",scalar(@infiles)); foreach $infile (@infiles) { if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/} $outfile="$1.$outext"; if ($update && -e $outfile) {next;} if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} &reformat($infile,$outfile); } exit 0; } else { if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} &reformat($infile,$outfile); exit 0; } ################################################################################################ # Reformat a single file ################################################################################################ sub reformat() { $infile=$_[0]; $nss=-1; $titleline=""; ################################################################################################ # Input part ################################################################################################ my $skip=0; open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n"); # Read a2m, a3m, fas format if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m") { $/=">"; # set input field seperator $n=0; my $seq=; if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file # If first line has no sequence name, use >root_name if ($seq ne ">") { $infile="/$infile."; # determine root name 1 $infile=~/^.*\/(.*?)\..*/; # determine root name 2 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; $seq=~tr/\n //d; # remove newlines and blanks $seqs[$n]=$seq; $n++; } while ($seq=) { $seq=~s/\n\#.*//g; # remove commentary lines while ($seq=~s/(.)>/$1/) {$seq.=;} # in the case that nameline contains a '>'; '.' matches anything except '\n' if ($seq=~/^aa_/) {next;} if ($seq=~/^sa_/ && $nosa) {next;} if ($seq=~/^ss_/) { if ($noss) {next;} # do not read in >ss_ and >sa_ sequences # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines $nss=$n; } $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n' if (defined $1 && $1 ne "") { $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; } else { $names[$n]=$n; } $seqs[$n]=$seq; $n++; } $/="\n"; # reset input field seperator } # Read Stockholm format elsif ($informat eq "sto") { my %seqhash; my $name; my $first_block=1; $n=0; while ($line = ) { $line=~tr/\r//d; $line=~s/\s+/ /g; if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line if ($line=~/^\#/) {next;} # skip commentary and blank line if ($line=~/^\/\//) {last;} # reached end of file if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts if ($line!~/^\s*(\S+)\s+(\S+)/) { die ("\nERROR found in stockholm format: $!"); } if (!(exists $seqhash{$1})) { if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences if ($line=~/^ss_/) { if ($noss) {next;} # do not read in >ss_ and >aa_ sequences # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines $nss=$n; } $line=~/^\s*(\S+)\s+(\S+)/; $names[$n]=$1; $seqs[$n]=$2; $seqhash{$1}=$n++; $first_block=1; } else { if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");} $seqs[$seqhash{$1}].=$2; } # printf("%3i %s\n",$n-1,$seqs[$n-1]); } } elsif ($informat eq "clu") { my $residues_per_line=50; # default number of characters for namefield residues # (only needed if no gap between name and residues in first sequence -> SMART) my $block=1; # number of block currently read my $name; my $residues; $n=0; # sequence number of first block $k=0; # sequence index to zero for first block while ($line = ) { # print($namefieldlen.$line); $line=~tr/\r//d; if ($line=~/CLUSTAL/i) {next;} if ($line=~/^\#/) {next;} # skip commentary and blank line if ($line=~/^\/\//) {last;} # reached end of file if ($line=~/^\s*$/){ # new sequence block starts if ($k) { if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} $block++; $n=$k; $k=0; # set sequence index to zero for next block to read } next; } if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) { if ($line=~/^[*.: ]*$/) {next;} if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences chomp($line); if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) { die ("\nError found in Clustal format in $infile, line $.: '$line'\n"); } $name=$1; $residues=$2; print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n"); } else { if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences if ($line=~/^ss_/) { # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines $nss=$n; } $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; $name=$1; $residues=$2; $residues=~tr/ //d; $residues_per_line=length($residues); } if ($block==1) { $names[$k]=$name; $seqs[$k]=$residues; } else { $seqs[$k].=$residues; if ($names[$k] ne $name) { print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); } } # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); $k++; } if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} if (!$n) {$n=$k;} } # Read psi format elsif ($informat eq "psi") { my $block=1; # number of block currently read my $name; my $residues; $n=0; # sequence number of first block $k=0; # sequence index to zero for first block while ($line = ) { # print($namefieldlen.$line); $line=~tr/\r//d; if ($line=~/^\s*$/){ # new sequence block starts if ($k) { if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} $block++; $n=$k; $k=0; # set sequence index to zero for next block to read } next; } if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences if ($line=~/^ss_/) { # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines $nss=$n; } $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; $name=$1; $residues=$2; $residues=~tr/ //d; if ($block==1) { $names[$k]=$name; $seqs[$k]=$residues; } else { $seqs[$k].=$residues; if ($names[$k] ne $name) { print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); } } # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); $k++; } if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} if (!$n) {$n=$k;} } close INFILE; # Empty input file? if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");} ################################################################################################ # Transforming to upper-case ################################################################################################ if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} } ################################################################################################ # Removing non-alphanumeric symbols ################################################################################################ for ($k=0; $k<$n; $k++) { $seqs[$k]=~tr/A-Za-z0-9.~-//cd; $seqs[$k]=~tr/~/-/; } ################################################################################################ # Filling up with gaps '.' or deleting gaps ################################################################################################ # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M option if ($informat eq "a3m" && (!$remove_inserts || $matchmode)) { print("inserting gaps...\n"); my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i. my $j; # counts match states my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state my $insert; # Determine length of longest insert for ($k=0; $k<$n; $k++) { # split into list of single match states and variable-length inserts # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements # The '#' symbol is prepended to get rid of a perl bug in split @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); $j=0; # printf("Sequence $k: $seqs[$k]\n"); # printf("Sequence $k: @inserts\n"); # Does sequence $k contain insert after match state j that is longer than $ngap[$j]? foreach $insert (@inserts) { if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);} $j++; # printf("$insert|"); } # printf("\n"); } my $ngap; # Fill up inserts with gaps '.' up to length $len_ins[$j] for ($k=0; $k<$n; $k++) { # split into list of single match states and variable-length inserts @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); $j=0; # append the missing number of gaps after each match state foreach $insert (@inserts) { for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";} $j++; } $seqs[$k] = join("",@inserts); $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end } } ################################################################################################ # Match state assignment ################################################################################################ # Default match state rule for a3m is -M first if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";} # Use gap rule for match state assignment? if ($matchmode eq "gaprule") { my @gaps=(); my $residues; my @residues; # Determine number of gaps per column for ($k=0; $k<$n; $k++) { @residues=unpack("C*",$seqs[$k]); for (my $l=0; $l<@residues; $l++) { if ($residues[$l]==46 || $residues[$l]==45) { if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;} } } } # Set columns with more gaps than $match_gaprule to lower case, for ($k=0; $k<$n; $k++) { @residues=unpack("C*",$seqs[$k]); $residues=""; for (my $l=0; $l<@residues; $l++) { if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) { if ($residues[$l]==46) { $residues .= "-"; } else { $residues .= uc(chr($residues[$l])); } } else { if ($residues[$l]==45) { $residues .= "."; } else { $residues .= lc(chr($residues[$l])); } } $seqs[$k]=$residues; } } } # Use first sequence for match state assignment? if ($matchmode eq "first") { my @match=(); my $residues; my @residues; # Determine which columns have a gap in first sequence for ($k=0; $k=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");} splice(@seqs,$k,1); splice(@names,$k,1); $k--; $n--; } } # Cut down sequence names to $desclen characters maximum for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);} if ($outformat eq "a3m") { # Remove all '.' gaps for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;} } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) { # Substitute all '.' by '-' for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;} } if ($gap ne "default") { for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;} } if ($case eq "uc") { for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} } elsif ($case eq "lc") { for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;} } #################################################### # Check that sequences have same length #################################################### if ($outformat ne "a3m" && $outformat ne "ufas") { my $len=length($seqs[0]); for($k=1; $k<$n; $k++) { if (length($seqs[$k])!=$len) { printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n", $names[0],$len,$names[$k],length($seqs[$k])); if ($v>=3) { printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]); } exit 1; } } } #################################################### # Remove html tags #################################################### for($k=0; $k<$n; $k++) { $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags } ################################################################################################ # Output part ################################################################################################ my $ndssp=-1; my $nsa=-1; my $npred=-1; my $nconf=-1; my $nquery=-1; for ($k=0; $k<$n; $k++) { if ($names[$k]=~/^ss_dssp/){$ndssp=$k; } elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; } elsif ($names[$k]=~/^ss_pred/){$npred=$k; } elsif ($names[$k]=~/^ss_conf/){$nconf=$k; } elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;} } #Write sequences into output file open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n"); if ($outformat eq "sto" || $outformat eq "psi") { my $refline; if ($outformat eq "sto") { print(OUTFILE "# STOCKHOLM 1.0\n\n"); # Write reference annotation line for match states (-M first) if (!$lname) {$lname=32;} if ($names[$nquery] =~ /^\S+\s+(.*)/) { printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE", $1); } $refline=$seqs[$nquery]; $refline=~s/[a-z]/-/g; printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline); if ($ndssp>=0) { printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]); } } if ($num) { my $num=2; for ($k=0; $k<$n; $k++) { if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} $names[$k]=~s/^(\S+)\#\d+/$1/; $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/; $num++; } } for ($k=0; $k<$n; $k++) { if ($k==$ndssp || $k==$npred || $k==$nconf) {next;} $names[$k] =~ /\s*(\S+)/; if (!$lname) {$lname=32;} printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]); } if ($outformat eq "sto") {print(OUTFILE "//\n");} } elsif ($outformat eq "clu") { printf(OUTFILE "CLUSTAL\n\n\n"); if ($num) { my $num=2; for ($k=0; $k<$n; $k++) { if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} $names[$k]=~s/^(\S+)\#\d+/$1/; $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/; $num++; } } while($seqs[0] ne "") { # While there are still residues left, write new block of sequences for ($k=0; $k%s\n%s\n",$names[$k],$seqs[$k]); } } close OUTFILE; if ($v>=2) { if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");} else { if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); } print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n"); } } return; }