#!/usr/bin/perl ##---------------------------------------------------------------------------## ## File: ## @(#) BuildDatabase.pl ## Author: ## Arian Smit ## Robert Hubley ## Description: ## A utility for creating a WU-Blast/RepeatModeler XDF database ## from a set of fasta files. ## #****************************************************************************** #* Copyright (C) Institute for Systems Biology 2004 Developed by #* Arian Smit and Robert Hubley. #* #* This work is licensed under the Open Source License v2.1. To view a copy #* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or #* see the license.txt file contained in this distribution. #* ############################################################################### # # To Do: # # =head1 NAME BuildDatabase - Format FASTA files for use with RepeatModeler =head1 SYNOPSIS BuildDatabase [-options] -name "mydb" or BuildDatabase [-options] -name "mydb" -dir or BuildDatabase [-options] -name "mydb" -batch =head1 DESCRIPTION This is basically a wrapper around AB-Blast's and NCBI Blast's DB formating programs. It assists in aggregating files for processing into a single database. Source files can be specified by: - Placing the names of the FASTA files on the command line. - Providing the name of a directory containing FASTA files with the file suffixes *.fa or *.fasta. - Providing the name of a manifest file which contains the names of FASTA files ( fully qualified ) one per line. NOTE: Sequence identifiers are not preserved in this database. Each sequence is assigned a new GI ( starting from 1 ). The translation back to the original sequence is preserved in the *.translation file. The options are: =over 4 =item -h(elp) Detailed help =item -name The name of the database to create. =item -engine The name of the search engine we are using. I.e abblast/wublast or rmblast. =item -dir The name of a directory containing fasta files to be processed. The files are recognized by their suffix. Only *.fa and *.fasta files are processed. =item -batch The name of a file which contains the names of fasta files to process. The files names are listed one per line and should be fully qualified. =back =head1 SEE ALSO =over 4 RepeatModeler, RMBlast =back =head1 COPYRIGHT Copyright 2004-2019 Institute for Systems Biology =head1 AUTHOR Robert Hubley =cut # # Module Dependence # use strict; use FindBin; use lib $FindBin::RealBin; use Getopt::Long; use Pod::Text; use File::Temp qw/ tempfile tempdir /; use Cwd; # RepeatModeler Libraries use RepModelConfig; use lib $RepModelConfig::configuration->{'REPEATMASKER_DIR'}->{'value'}; # # Class Globals & Constants # my $CLASS = "BuildDatabase"; my $DEBUG = 0; $DEBUG = 1 if ( $RepModelConfig::DEBUGALL == 1 ); my $version = $RepModelConfig::VERSION; # # Option processing # e.g. # -t: Single letter binary option # -t=s: String parameters # -t=i: Number paramters # my @opts = qw( help dir=s engine=s batch=s name=s ); # Add configuration parameters as additional command-line options push @opts, RepModelConfig::getCommandLineOptions(); # # Provide the POD text from this file and # from the config file by merging them # together. The heading "CONFIGURATION # OVERRIDES" provides the insertion point # for the configuration POD. # sub usage { my $p = Pod::Text->new(); $p->output_fh( *STDOUT ); my $pod_str; open IN, "<$0" or die "Could not open self ($0) for generating documentation!"; while ( ) { if ( /^=head1\s+CONFIGURATION OVERRIDES\s*$/ ) { my $c_pod = RepModelConfig::getPOD(); if ( $c_pod ) { $pod_str .= $_ . $c_pod; } } else { $pod_str .= $_; } } close IN; print "$0 - $version\n"; $p->parse_string_document( $pod_str ); exit( 1 ); } # # Get the supplied command line options, and set flags # my %options = (); unless ( &GetOptions( \%options, @opts ) ) { usage(); } # Print the internal POD documentation if something is missing if ( ( $#ARGV == -1 && !$options{'batch'} && !$options{'dir'} ) || $options{'help'} ) { print "No query sequence file indicated\n\n"; usage(); } # # Resolve configuration settings using the following precedence: # command line first, then environment, followed by config # file. # RepModelConfig::resolveConfiguration( \%options ); my $config = $RepModelConfig::configuration; my $XDFORMAT_PRGM = $config->{'ABBLAST_DIR'}->{'value'} . "/xdformat"; my $NCBIBLASTDB_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/makeblastdb"; my $NCBIDBCMD_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/blastdbcmd"; my $engine = "rmblast"; if ( $options{'engine'} ) { if ( $options{'engine'} =~ /wublast|abblast/i ) { die "RepeatModeler doesn't appear to be configured to use the " . "$options{'engine'} search engine. Please re-run configure before " . "using this or other RepeatModeler programs.\n" if ( !-x $XDFORMAT_PRGM ); $engine = "abblast"; } elsif ( $options{'engine'} =~ /rmblast|ncbi/i ) { die "RepeatModeler doesn't appear to be configured to use the " . "$options{'engine'} search engine. Please re-run configure before " . "using this or other RepeatModeler programs.\n" if ( !-x $NCBIBLASTDB_PRGM ); $engine = "rmblast"; } else { print "I don't understand -engine $options{'engine'}\n"; usage(); } } die "Missing database name!\n" if ( !defined $options{'name'} ); my @fileList = (); # Directory specified if ( $options{'dir'} ) { if ( -d $options{'dir'} ) { # Process the directory opendir DIR, $options{'dir'} or die "Cannot open dir $options{'dir'}!\n"; my $file; while ( defined( $file = readdir( DIR ) ) ) { if ( $file =~ /\.fa$/i || $file =~ /\.fasta$/i || $file =~ /\.fast$/i || $file =~ /\.dna/i ) { push @fileList, "$options{'dir'}/$file"; } } closedir( DIR ); } else { die "Directory $options{'dir'} doesn't exist!\n"; } } # Batch specified if ( $options{'batch'} ) { if ( -s $options{'batch'} ) { open BATCH, "<$options{'batch'}" or die "Cannot open batch file $options{'batch'}!\n"; while ( ) { s/[\n\r\s]+//g; if ( -s $_ ) { push @fileList, $_; } else { die "File $_ from batch $options{'batch'} does not exist!\n"; } } close BATCH; } else { die "Batch file $options{'batch'} doesn't exist!\n"; } } # Push remaining command line files foreach my $file ( @ARGV ) { if ( -f $file ) { push @fileList, $file; } else { die "Command line fasta file $file does not exist!\n"; } } # # Sanitize sequence names # my $IDX; my $index = 1; open $IDX, ">$options{'name'}.translation" or die $CLASS . ": Cannot open file $options{'name'}.translation\n"; my $results = ""; my $dbSeqs = 0; my $dbSize = 0; print "Building database $options{'name'}:\n"; # Sanitizing the names of all input databases and placing # in one big tempfile # $fh = tempfile(); my ( $SEQ, $seqFilename ) = tempfile( DIR => "." ); my $file; for ( my $i = 0 ; $i <= $#fileList ; $i++ ) { $file = $fileList[ $i ]; print " Reading $file...\n"; if ( $file =~ /^.*\.gz$/ ) { open IN, "gunzip -c $file|" or die $CLASS . ": Cannot open file using gunzip $file\n"; } else { open IN, "<$file" or die $CLASS . ": Cannot open file $file\n"; } while ( ) { if ( /^\>\s*(\S+)\s+(.*)/ ) { print $SEQ ">gi|$index\n"; print $IDX "$1\t$index\n"; $index++; } else { print $SEQ $_; } } close IN; } close $SEQ; $file = $seqFilename; if ( $engine eq "rmblast" ) { my $cmd = "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $options{'name'} -parse_seqids -dbtype nucl -in $file 2>&1"; print "Running: $cmd\n" if ( $DEBUG ); $results = `$cmd`; if ( $? ) { print "The makeblastdb program exited with code " . ($? >> 8) . ". Please check your input file(s) for potential formating errors.\n$NCBIBLASTDB_PRGM returned: $results\n"; print "The command used was: $cmd\n"; die; } } else { $results = `$XDFORMAT_PRGM -n -o $options{'name'} $file 2>&1`; while ( $results =~ /sequences\s*\(letters\)\s*(?:written|appended):\s*([\d,]+)\s+\(([\d,]+)\).*$/mg ) { my $size = $2; my $seqs = $1; $size =~ s/,//g; $dbSize += $size; $seqs =~ s/,//g; $dbSeqs += $seqs; last; } } if ( $engine eq "abblast" ) { $results = `$XDFORMAT_PRGM -n -X $options{'name'} 2>&1`; } else { $results = `$NCBIDBCMD_PRGM -db $options{'name'} -info 2>&1`; while ( $results =~ /\s+([\d\,]+)\s+sequences;\s+([\d\,]+)\s+total bases.*$/mg ) { my $size = $2; my $seqs = $1; $size =~ s/,//g; $dbSize += $size; $seqs =~ s/,//g; $dbSeqs += $seqs; last; } } print "Number of sequences (bp) added to database: $dbSeqs ( $dbSize bp )\n"; close $IDX; unlink( $seqFilename ) if ( -e $seqFilename ); 1;