#!/usr/bin/env bash

#-------------------------------------------------------------------------------------------------------
#: PROGRAM: run_phylip.sh
#: AUTHOR: Pablo Vinuesa, Center for Genomic Sciences, UNAM, Mexico
#:         https://www.ccg.unam.mx/~vinuesa/ twitter: @pvinmex
#
#: PROJECT START: October 16th, 2013
#    This program has been developed mainly for teaching purposes, 
#    with improvements/new features added as the script was used in 
#    diverse courses taught to undergrads at https://www.lcg.unam.mx
#    (LCG-UNAM) and the International Workshops on Bioinformatics (TIB)

#: AIM: run PHYLIP's distance methods [NJ|UPGMA] for DNA and proteins (dnadist|protdist) with optional bootstrapping
#       This script was written to teach intermediate Bash scripting to my students at the 
#       Bachelor's Program in Genome Sciences at the Center for Genome Sciences, UNAM, Mexico
#       https://www.lcg.unam.mx
#
#: INPUT: multiple sequence alignments (PROT|DNA) with at least 4 distinct sequences in phylip format
#
#: OUTPUT: [NJ|UPGMA] phylogenies and, if requested, bootstrap consensus trees

#: SOURCE: Freely available on GitHub @ https://github.com/vinuesa/intro2linux
#          Released under the GPLv3 License. 
#          http://www.gnu.org/copyleft/gpl.html

#### DEPENDENCIES 
# Assumes that the following binaries and scripts are all in $PATH, checked by check_dependencies()
#
#   1) Binaries from the PHYLIP package: 
#	seqboot dnadist protdist neighbor consense
#   NOTE: Linux-compatible PHYLIP binaries are supplied in the distro\'s bin/ directory
#           https://github.com/vinuesa/intro2linux/tree/master/bin  

#: TODO:
#    implement also parsimony and ML analyses and parallelize bootstrapping

#: KNOWN BUGS: None.
#    Please report any errors you may encounter through the GitHub issue pages
#-------------------------------------------------------------------------------------------------------

# make sure the user has at least bash version 4, since the script uses standard arrays (introduced in version 4),
#  but future development may require hashes, introduced in version 4
[ "${BASH_VERSION%%.*}" -lt 4 ] && echo "$HOSTNAME is running an ancient bash: ${BASH_VERSION}; ${0##*/} requires bash version 4 or higher" && exit 1

# 0. Define strict bash settings
set -e              # exit on non-zero exit status
set -u              # exit if unset variables are encountered
set -o pipefail     # exit after unsuccessful UNIX pipe command

# set and export LC_NUMERIC=en_US.UTF-8, to avoid problems with locales tha use 1,32
LC_NUMERIC=en_US.UTF-8
export LC_NUMERIC

args="$*"
progname=${0##*/} # run_phylip.sh
VERSION=2.0 

# GLOBALS
#DATEFORMAT_SHORT="%d%b%y" # 16Oct13
#TIMESTAMP_SHORT=$(date +${DATEFORMAT_SHORT})

date_F=$(date +%F |sed 's/-/_/g')-   # 2013_10_20
date_T=$(date +%T |sed 's/:/./g')    # 23.28.22 (hr.min.secs)
start_time="$date_F$date_T"
#current_year=$(date +%Y)

wkdir=$(pwd)

# initialize variables
def_DNA_model=F84
def_prot_model=JTT
input_phylip=
runmode=
boot=100
CV=1
model=
DEBUG=0
sequential=0
TiTv=2
upgma=0
outgroup=0
gamma="0.0"
outgroup=1

nw_utils_ok=

declare -a outfiles # to collect the output files written to disk

#---------------------------------------------------------------------------------#
#>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#
#---------------------------------------------------------------------------------#
function print_dev_history()
{
    cat <<EOF_
    
    $progname $VERSION has been developed mainly for teaching purposes, 
      with improvements/new features added as the script was used in 
      diverse courses taught to undergrads at https://www.lcg.unam.mx
      and the International Workshops on Bioinformatics (TIB)
      
    TODO: * implement also parsimony and ML analyses
          * parallelize bootstrapping  
    
    # v2.0 2020-11-19; * fixed bug in write_protdist_params in the if [ $model == JTT ] || [ ...] conditional
   		       * fixed an error in the model name input checks, changing PMG for PMB
   		       * fixed grep in check_nw_utils_ok and made more robust conditional checking to call the function
   		       * use variable expansion only once in naming outfiles outtress, saving it to a variable
   			 to make the code more streamlined and possibly a tiny bit faster
   		       * removed useless [ -s outtre|outfile ] tests for the orignal tress|outfiles in the bootstrapping block, 
   			   as these are already checked in the first code block computing tress from the original alignment
    
    # v1.6 2020-11-19; * streamlined write_PHYLIP_param functions by grouping echo calls in { } > params; avoiding concatenation 
    #                  * improved description of write_PHYLIP_param functionality

    # v1.5 2020-11-15; * added check_phylip_ok to validate input phylip file with -ge 4 sequences and -ge 4 characters
    		       * added remove_phylip_param_files
    		       * added option -I to call print_install_notes
    		       * added [ "${BASH_VERSION%%.*}" -lt 4 ] && die
    		       * makes more extensive use of internal Bash variables and variable expansions, including \${var/pattern/string}

    # v1.4 2020-11-14; * added function check_nw_utils_ok, to check that nw_display and nw_support can be executed
    			  as they may be in path but cannot find /lib64/libc.so.6: version GLIBC_2.14
    			  when the binaries are compiled with dynamic linking to the libs
    
    # v1.3 2020-11-14; * improved layout of output messages; 
    		       * improved regex in extract_tree_from_outfile (now also for NJ tree)
    		       * if nw_support and nw_display are not available, 
    			 prints both the NJ and boot trees to screen if bootstrap analysis was performed
    
    # v1.2 2020-11-11; set and export LC_NUMERIC=en_US.UTF-8, to avoid problems with locales that use 1,32 instead of 1.32
    
    # v1.1  2020-11-11; added function extract_tree_from_oufile and calls it on NJ|UPGMA bootRepl consensus trees
                          if nw_display is not available in PATH
    
    #v1.0  2020-11-09; This version has thorough error checking and reporting with improved code flow
         *  re-ingeneered the main code block. It now first runs the standard distance-matrix + clustering computations
	        before doing bootstrapping. This helps in rapidly detecting problematic model parameters, as reflected in negative distances
	 *  displays NJ|UPGMA tree without bootstrap values, if no bootstrapping is requested	
	 *  added functions:
	      - check_matrix to check if they contain negative values, issuing an error message
	      - display_treeOI display trees with nw_display, only if they do not contain negative branch lengths, issuing an error message
	      - print_dev_history to clean up the script\'s header section
	 *  made 100% shellcheck-compliant; prints all error messages to >&2 (STDERR)
         *  changed/fixed defaults for CV=1 & gamma="0.0"; 
	 *  use outfiles+=() to capture outfiles for -R 1 and -R 2 when boot -eq 0; now runs with -i -R1|2 as single opts
	 *  Changed default boot=100
	 *  now accepts TiTv=real like 6.7 and checks for real numbers passed to -t
	 *  checks if outtree contains negative branches before attempting to display with nw_display & prints proper message

    # v0.5 2020-11-09; added functions:
    	 *  rdm_odd_int to compute random odd integers to pass to PHYLIP programs
    	 *  check_output (silently checks for expected output files, dying if not found)

    #v0.4 2020-11-08; released to the public domain @ https://github.com/vinuesa/intro2linux
    	   * added nw_support and nw_display calls to map and display bottstrap support values onto NJ or UPGMA trees
    	   * fixed several warnings issued by shellcheck
    	   * added strict bash interpreter calling with set -e; set -u; set -o pipefail
    	   * localized variables received in functions
    	   * explicitly set default_DNA_model=F84 and default_prot_model=JTT
    	   * added option -v to print version and exit
    		
    #v0.3 September 15, 2017; improved checking of user input
    #v0.2 September 16, 2015; basic options and parameter checking for protdist phylogenies
    #v0.1 October 16, 2013; basic options and parameter checking for dnadist phylogenies
EOF_

exit 0
}
#---------------------------------------------------------------------------------#

function print_install_notes()
{
   cat <<INSTALL
   
    #1. PHYLIP PHYLogeny Inference Package by Joe Felensetein https://evolution.genetics.washington.edu/phylip.html
    
    The bin/ directory of the intro2linux distribution provides precompiled Linux-x86_64 binaries 
          for seqboot dnadist protdist neighbor consense. Copy them into your \$HOME/bin directory or any other in your PATH
    
    If you need executables for a different architecture, visit https://evolution.genetics.washington.edu/phylip/executables.html
    
    #2. Newick Utilities http://cegg.unige.ch/newick_utils
    The Newick Utilities have been described in an Open-Access paper (that is, available online for everyone):

    The Newick Utilities: High-throughput Phylogenetic tree Processing in the UNIX Shell
    Thomas Junier and Evgeny M. Zdobnov
    Bioinformatics 2010 26:1669-1670
    http://bioinformatics.oxfordjournals.org/content/26/13/1669.full
    doi:10.1093/bioinformatics/btq243
    
    The src/ dir contains the newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz source file
    
    Visit http://cegg.unige.ch/newick_utils if you need other pre-compiled versions
    
    Brief compilation notes
    
    1. Copy the newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz to a suitable directory in your \$HOME
    2. cd into the directory hoding the source *tar.gz file
    3. unpack and compile with the following commands 
          tar -xvzf newick-utils-1.6-Linux-x86_64-disabled-extra.tar.gz
	  cd newick-utils-1.6
	  ./configure --prefix=$HOME  # if you do NOT have administrator privileges
          #./configure                # if you have superuser privileges
	  make
	  make check
	  make install                # if you do NOT have administrator privileges
	  # sudo make install         # if you have superuser privileges

INSTALL

exit 0

}
#---------------------------------------------------------------------------------#

function check_output()
{
    local outfile=$1
    if [ ! -s "$outfile" ]
    then
        echo
	echo " >>> ERROR! The expected output file $outfile was not produced, will exit now!" >&2
        echo
	exit 2
    fi
}
#---------------------------------------------------------------------------------#

function check_phylip_ok()
{
    # implements a rudimentary format validation check
    local phyfile=$1
    local num_tax=
    local num_char=

    num_tax=$(awk 'NR == 1 {print $1}' "$phyfile")
    num_char=$(awk 'NR == 1 {print $2}' "$phyfile")
    
    if [[ ! "$num_tax" =~ ^([0-9]+)$ ]] ||  [[ ! "$num_char" =~ ^([0-9]+)$ ]]
    then
         echo "ERROR: $phyfile is not a phylip-formatted input file!"
	 echo
	 exit 1  
    elif [[ "$num_tax" =~ ^([0-9]+)$ ]] && [ "$num_tax" -lt 4 ]
    then
         echo "ERROR: $phyfile should contain at least 4 taxa"
	 exit 1
    elif [[ "$num_char" =~ ^([0-9]+)$ ]] && [ "$num_char" -lt 5 ]
    then
         echo "ERROR: $phyfile should contain at least 5 aligned residues"
	 exit 1
    else
        echo "# File validation OK: $phyfile seems to be a standard phylip file ..."
        echo '-------------------------------------------------------------------------------------------'
    fi
}
#---------------------------------------------------------------------------------#

function check_dependencies
{
    # check that the following PHYLIP binaries are in $PATH; die if not
    # optional: the Newick_utilities src is found in the src/ dir
    dependencies=(seqboot dnadist protdist neighbor consense) 
    for programname in "${dependencies[@]}"
    do
       local bin=
       bin=$(type -P "$programname")
       if [ -z "$bin" ]; then
          echo
          echo "# ERROR: $programname not in place!" >&2
          echo "# ... you will need to install \"$programname\" first or include it in \$PATH" >&2
          echo "# ... exiting" >&2
          exit 1
       fi
    done

    echo
    echo '# Run check_dependencies() ... looks good: all required external dependencies are in place!'
    echo '-------------------------------------------------------------------------------------------'
}
#---------------------------------------------------------------------------------#

function check_nw_utils_ok()
{
    # This function checks that nw_display and nw_support can be executed
    #  as they may be in path but cannot find /lib64/libc.so.6: version GLIBC_2.14
    #  when the binaries are compiled with dynamic linking to the libs
    
    # The function returns 0 when nw_support nw_display cannot be executed and 1 when they can
    # The function does not run and return anything if nw_support nw_display are not in PATH
    
    dependencies=(nw_support nw_display)
    
    local nw_utils_ok=
    local bin=
    
    for programname in "${dependencies[@]}"
    do
        bin=$(type -P "$programname")
        if [ -n "$bin" ]; then
	   # prints something like nw_support: /lib64/libc.so.6: version `GLIBC_2.14' not found
	   "$programname" 2>> "nw_utils_check.out.tmp" 
        fi
    done
    
    # check the contents of nw_utils_check.out.tmp and set proper flag
    if [ -s nw_utils_check.out.tmp ]
    then
           # set the nw_utils_ok flag to 0 if the lib is not found, to check later in the code when nw_support and nw_display are called
           grep -i error nw_utils_check.out.tmp &> /dev/null
	   nw_utils_ok=$? 
	   [ "$nw_utils_ok" -eq 0 ] && echo "# WARNING: ${dependencies[*]} are in PATH but cannot find /lib64/libc.so.6: version GLIBC_2.14" >&2 
           rm nw_utils_check.out.tmp 
           echo  "$nw_utils_ok"
    fi
}
#---------------------------------------------------------------------------------#

function rdm_odd_int()
{
  # generate a random odd integer for seqboot and neighbor
  while true
  do 
     i=$RANDOM
     if (( "$i" % 2 )) # true when odd
     then 
         echo "$i" 
	 break
     fi
done

}
#---------------------------------------------------------------------------------#

function check_matrix()
{
    # check that the distance matrix does not contain negative values due to wrong model parameters
    local matrix=$1
    
    if grep -El ' \-[0-9\.]+' "$matrix"
    then
        echo "ERROR: computed negative distances!" >&2
	     [ "$runmode" -eq 1 ] && echo "You may need to ajust the model and|or gamma value; try lowering TiTv if > 6" >&2
	     [ "$runmode" -eq 2 ] && echo "You may need to ajust the matrix and|or gamma value" >&2
	exit 3
    fi
}
#---------------------------------------------------------------------------------#

function display_treeOK()
{
     # checks that tree does not contain branches with negative lengths, 
     #    as these cannot be displayed  with nw_dsiplay
     local tree=$1
     
     # check that there are no negative branch lengths in the nj_tree
     if ! grep -El '\-[0-9\.]+' "$tree"
     then 
     	 echo "# displaying $tree ..."
     	 echo
     	 nw_display -w 80 -Ir "$tree"
     	 echo
     else
     	 echo "ERROR: cannot display $nj_tree with nw_dsiplay, as it contains branches with negative lengths!" >&2
	     [ "$runmode" -eq 1 ] && echo "You may need to adjust (lower?) TiTv=$TiTv" >&2
	     [ "$runmode" -eq 2 ] && echo "You may need to use another matrix or adjunst gamma" >&2
     	 exit 4
     fi
}
#---------------------------------------------------------------------------------#

function extract_tree_from_outfile()
{
    # grep out lines containing tree characters | - and print to STDOUT
    local outfile=$1
   
    grep --color=never -E '[[:blank:]]+\||-' "$outfile" | grep -Ev 'Neighbor-|^---'
}

#------------------------------------- PHYLIP FUNCTIONS ---------------------------------------#
# >>> these are fucntions to write the command files to pass parameters to PHYLIP programs <<< # 

function remove_phylip_param_files()
{
   [ -s dnadist.params ] && rm dnadist.params
   [ -s protdist.params ] && rm protdist.params
   [ -s seqboot.params ] && rm seqboot.params
   [ -s neighbor.params ] && rm neighbor.params
   [ -s consense.params ] && rm consense.params
   return 0
}
#------------------------------------- PHYLIP FUNCTIONS ---------------------------------------#

function write_dnadist_params
{
    # writes a parameter file to run dnadist, based on provided arguments
    local model=$1
    local boot=$2
    local TiTv=$3
    local sequential=$4
    local gamma=$5
    local CV=$6

    # Runmode 1 = dnadist 
    if [ "$model" = 'F84' ] || [ -z "$model" ]
    then 
        {
          [ "$model" = 'Kimura' ]       && echo "D"                 
          [ "$model" = 'Jukes-Cantor' ] && echo -ne "D\nD\n"        
          [ "$model" = 'LogDet' ]       && echo -ne "D\nD\nD\n"     
          [ "$TiTv" != "2" ]            && echo -ne "T\n$TiTv\n"    
          [ "$gamma" != "0" ]           && echo -ne "G\n"           
          [ "$boot" -gt 0 ]             && echo -ne "M\nD\n$boot\n" 
          [ "$sequential" -eq 1 ]       && echo -ne "I\n"           
                                           echo -ne "Y\n"             
          [ "$gamma" != "0" ]           && echo -ne "$CV\n"         
    
        } > dnadist.params
   fi
}
#---------------------------------------------------------------------------------#

function write_protdist_params
{
    # write_protdist_params "$model" "$boot" "$sequential" "$gamma" "$CV"
    # writes a parameter file to run protdist, based on provided arguments
    # Models: JTT, PMB, PAM, Kimura
    local model=$1
    local boot=$2
    local sequential=$3
    local gamma=$4
    local CV=$5

    if [ "$model" = 'JTT' ] || [ -n "$model" ]
    then
       { # Runmode 2 = protdist 
         [ "$model" = 'PMB' ]     && echo "P"                 
         [ "$model" = 'PAM' ]     && echo -ne "P\nP\n"        
         [ "$model" = 'Kimura' ]  && echo -ne "P\nP\nP\n"     
         [ "$gamma" != "0" ]      && echo -ne "G\n"           
         [ "$boot" -gt 0 ]        && echo -ne "M\nD\n$boot\n" 
         [ "$sequential" -eq 1 ]  && echo -ne "I\n"    
                                     echo -ne "Y\n"       
         [ "$gamma" != "0" ]      && echo -ne "$CV\n"   			   
       } > protdist.params
    fi  
}
#---------------------------------------------------------------------------------#

function write_seqboot_params
{
    # writes a parameter file to run seqboot, based on provided arguments
    local boot=$1
    local sequential=$2
    
    # Write Seqboot params
    {
      [ "$boot" -gt 0 ]       && echo -ne "R\n$boot\n" 
      [ "$sequential" -eq 1 ] && echo -ne "I\n"        
                                 echo -ne "Y\n"
                                 echo -ne "$ROI\n"
    } > seqboot.params 			      
}
#---------------------------------------------------------------------------------#

function write_neighbor_params
{
    # writes a parameter file to run neighbor, based on provided arguments
    local boot=$1
    local upgma=$2
    local outgroup=$3
    
    {
       # Write $ROI params
       [ "$upgma" -gt 0 ]    && echo -ne "N\n"                 
       [ "$outgroup" -gt 1 ] && echo -ne "O\n$outgroup\n"      
       [ "$boot" -gt 0 ]     && echo -ne "M\n$boot\n$ROI\n"    
                                echo -ne "Y\n"        
    } > neighbor.params
}
#---------------------------------------------------------------------------------#

function write_consense_params
{
    #writes a parameter file to run consense; very difficult ;)
    [ -s consense.params ] && rm consense.params
    {
      [ "$outgroup" -gt 1 ]  && echo -ne "O\n$outgroup\n"  
                                echo -ne "Y\n"           
    } > consense.params				
}
#---------------------------------------------------------------------------------#

# don't leave litter behind ... remove intermediate input/output files
function cleanup_dir 
{
   [ -s infile ]  && rm infile
   [ -s outfile ] && rm outfile
   [ -s intree ]  && rm intree
   [ -s outtree ] && rm outtree
   
   for file in *.params
   do
       [ -s "$file" ] && rm "$file"
   done    
}

#-------------------------------- END PHYLIP FUNCTIONS----------------------------#

function print_help
{
   #':b:c:m:I:o:R:t:hHIDsuv'
   cat<<EOF
   $progname v.$VERSION [OPTIONS]
   
   -h  prints this help message
   
 REQUIRED
   -i <input_phylip_file> (if sequential, use also flag -s)
   -R <integer>  RUNMODE
        1    run dnadist  (expects DNA  alignment in phylip format)
	2    run protdist (expects PROT alignment in phylip format)

 OPTIONAL
   -b <integer> No. of bootstrap pseudoreplicates [default: $boot]
   -g <real number> alpha for gamma distribution  [default: $gamma]
   -o <integer> outgroup sequence no.             [default: $outgroup]
   -s sequential format; -s (flag; no val)!       [default: interleaved]
   -t <digit> Transition/Transversion ratio       [default: $TiTv]
   -m <model name>  NOTE: TYPE AS SHOWN!
        DNA:  F84, Kimura, Jukes-Cantor, LogDet   [default: $def_DNA_model]
        PROT: JTT, PMB, PAM, Kimura               [default: $def_prot_model]
   -u <flag> use UPGMA for clustering             [default:  NJ]
   -v <flag> print program version and exit
   -D <flag> Activate debugging to keep cmd files [default: $DEBUG]
   -H <flag> print development history and exit   
   -I <flag> print installation notes and exit   
   
 AIM: run PHYLIP\'s distance methods [NJ|UPGMA] for DNA and proteins with bootstrapping
      This code was written to teach basic Bash scripting to my students at the 
      Bachelor\'s Program in Genome Sciences, Center for Genome Sciences, UNAM, Mexico
      https://www.lcg.unam.mx/      

 OUTPUT: [NJ|UPGMA] phylogenies and, if requested, bootstrap consensus trees 
         and  [NJ|UPGMA] phylogenies with bootstrap support values mappend on bipartitions
 
 EXTERNAL DEPENDENCIES: 
   * PHYLIP (https://evolution.genetics.washington.edu/phylip.html) programs:
      seqboot dnadist protdist neighbor consense 
   * Newick utilities programs (http://cegg.unige.ch/newick_utils) programs:
      optional: nw_support and nw_display; need to install separately
   - Notes: PHYLIP Linux 64bit binaries available in the bin/ dir
            Copy them to your \$HOME/bin directory or any other in your PATH
  
 LICENSING & SOURCE
       Author: Pablo Vinuesa | https://www.ccg.unam.mx/~vinuesa/ | twitter: @pvinmex
       Released under the GNU General Public License version 3 (GPLv3)
       http://www.gnu.org/copyleft/gpl.html
       source: https://github.com/vinuesa/intro2linux

EOF

exit 1   
}

#-------------------------------------------------------------------------------------------------#
#---------------------------------------- GET OPTIONS --------------------------------------------#
#-------------------------------------------------------------------------------------------------#

# GETOPTS
while getopts ':b:g:i:m:R:o:t:hDHIsuv' OPTIONS
do
   case $OPTIONS in

   b)   boot=$OPTARG
        ;;
   g)   gamma=$OPTARG
	[ "$gamma" != 0 ] && CV=$(echo "1/sqrt($gamma)" | bc -l) && printf -v CV "%.4f\n" "$CV"
        ;;
   h)   print_help
        ;;
   i)   input_phylip=$OPTARG
        ;;
   s)   sequential=1
        ;;
   m)   model=$OPTARG
        ;;
   o)   outgroup=$OPTARG
        ;;
   R)   runmode=$OPTARG
        ;;
   t)   TiTv=$OPTARG
        ;;
   u)   upgma=1
        ;;
   v)   echo "$progname v$VERSION" && exit
        ;;
   D)   DEBUG=1
        ;;
   H)   print_dev_history
        ;;
   I)   print_install_notes
        ;;
   :)   printf "argument missing from -%s option\n" "$OPTARG"
   	print_help
     	exit 2 
     	;;
   \?)  echo "need the following args: "
   	print_help
        exit 3
	;;
   esac >&2   # print the ERROR MESSAGES to STDERR

done

shift $((OPTIND - 1))

#--------------------------#
# >>> Check User Input <<< #
#--------------------------#

if [ -z "$input_phylip" ]
then
       echo
       echo "# ERROR: no input phylip file defined!"
       print_help
       exit 1    
fi

if [ -z "$runmode" ]
then
       echo
       echo "# ERROR: no runmode defined!"
       print_help
       exit 1    
fi

# automatically set TiTv=0 when running with protein matrices or the LogDet DNA model
[ "$runmode" -eq 2 ] && TiTv=0
[ "$model" == LogDet ] && TiTv=0


# check that bootstrap value is an integer
re='^[0-9]+$'
if [[ ! "$boot" =~ $re ]]
then
   echo
   echo "# ERROR: boot:$boot is not a positive integer >= 0; provide a value between 100 and 1000" >&2 
   echo
   print_help
   echo
   exit 3
fi

# check that Ti/Tv is a real number, integer or decimal
re_TiTv='^[0-9.]+$'
if [[ ! "$TiTv" =~ $re_TiTv ]] && [ "$runmode" -eq 1 ]
then
   echo
   echo "# ERROR: TiTv:$TiTv is not an integer >= 0; provide a value between 0-10" >&2 
   echo
   print_help
   echo
   exit 3
elif [[ "$TiTv" =~ $re_TiTv ]] && [ "$runmode" -eq 1 ] && { [ "$model" == Jukes-Cantor ] || [ "$model" == LogDet ]; }
then
       echo
       echo "# ERROR: $model is only valid when analyzing DNA alignments under the F84 or K2P models" >&2
       echo
       print_help
       exit 1    
fi

if [ "$gamma" != 0 ] # note, need to treat as string, since gamma will be generalle a float like 0.5
then
    # this is to avoid having dots within filename (converts gamma==0.2 to gammaf=02)
    gammaf=${gamma/\./} #gammaf=${gamma%.*}${gamma##*.}
else
    gammaf="$gamma"
fi    

# check model vs. runmode compatibility and suitable bootstrap values are provided
#   using two alternative sintaxes for educational purposes
if [ "$runmode" -eq 1 ] && { [ "$model" = JTT ] || [ "$model" = PMB ] || [ "$model" = PAM ] || [ "$model" = "$def_prot_model" ]; }
then
       echo
       echo "# ERROR: $model is only valid when analyzing protein alignments under -R 2" >&2 
       echo
       print_help
       exit 1    
elif [ "$runmode" -eq 2 ] && [[ "$model" =~ ^(F84|Jukes-Cantor|LogDet|"$def_DNA_model")$ ]]
then
       echo
       echo "# ERROR: $model is only valid when analyzing DNA alignments under -R 1" >&2
       echo
       print_help
       exit 1    
elif [ "$boot" -lt 0 ]
then
       echo
       echo "# ERROR: bootstrap value=$boot is < 0 and not permitted" >&2
       echo
       print_help
       exit 1    
elif [ "$boot" -gt 1000 ]
then
       echo
       echo "# WARNING: bootstrap value=$boot is > 1000. This may take a long time to run."
       echo -n "# Are you sure you want to continue anyway? Type Y|N "
       read -r answer
       if [ "$answer" = N ] || [ "$answer" = n ] 
       then
           echo
	   echo "# Ok, will exit then!"
	   echo
	   exit 2
       else
           echo
	   echo "# Ok, will proceed running with $boot bootstrap pseudoreplicates ..."
	   echo
       fi 
elif [ "$boot" -gt 0 ] && [ "$boot" -lt 100 ]
then
       echo
       echo "# WARNING: bootstrap value=$boot is a bit low. Use a value >= 100."
       echo -n "# Are you sure you want to continue anyway? Type Y|N "
       read -r answer
       if [ "$answer" = N ] || [ "$answer" = n ] 
       then
           echo
	   echo "# Ok, will exit then!"
	   echo
	   exit 2
       else
           echo
	   echo "# Ok, will proceed ..."
	   echo
       fi 
fi

# set the default DNA or PROT models, if not defined by the user
if [ "$runmode" -eq 1 ] && [ -z "$model" ]
then
       model="$def_DNA_model"
       #echo "# DNA model set to default: $def_DNA_model..."       
fi

if [ "$runmode" -eq 2 ] && [ -z "$model" ]

then
       model="$def_prot_model"
       #echo "# Protein matrix set to default: $def_prot_model ..."       
fi

# check that the user provided a valid DNA substitution model
if [ "$runmode" -eq 1 ] && [[ ! "$model" =~ ^(F84|Kimura|Jukes-Cantor|LogDet|"$def_DNA_model")$ ]]
then
   echo
   echo "# ERROR: $model is not a recognized substitution model for DNA sequences used by PHYLIP" >&2 
   echo
   print_help
   echo
   exit 3
fi

# check that the user provided a valid name for empirical substitution matrix
#    note the much shorter and cleaner notation of this test using extended regexes, than the previous one
if [ "$runmode" -eq 2 ] && [[ ! "$model" =~ ^(JTT|PMB|PAM|Kimura)$ ]]
then
   echo
   echo "# ERROR: $model is not a recognized substitution matrix for protein sequences used by PHYLIP" >&2 
   echo
   print_help
   echo
   exit 3
fi

#>>> Set the script's run random odd number required by diverse PHYLIP programs <<<
ROI=$(rdm_odd_int)

# print the run settings
echo
echo "### $progname v.$VERSION run on $start_time with the following parameters:"
echo "# work_directory=$wkdir"
echo "# input_phylip=$input_phylip"
echo "# model=$model | gamma=$gamma | CV=$CV | gammaf=$gammaf | ti/tv ratio=$TiTv |
     outgroup=$outgroup | UPGMA=$upgma | bootstrap no.=$boot | ROI=$ROI"
echo "# runmode=$runmode"
echo "# DEBUG=$DEBUG" 
echo "# command: $progname ${args[*]}"
echo

#-------------------------------------------------------------------------------------------------#
#------------------------------------------ MAIN CODE --------------------------------------------#
#-------------------------------------------------------------------------------------------------#

# 1. make sure the external dependencies are found in PATH
#     and that nw_display and nw_support find the required GLIBC_2.14 in /lib64/libc.so.6
check_dependencies
nw_utils_ok=$(check_nw_utils_ok) # nw_utils_ok -eq 1 when OK, -eq 0 when not

# 2. Start processing the input file
# make sure there are no old outfile or outtree files lying around from previous runs
[ -s infile ] && rm infile
[ -s outfile ] && rm outfile
[ -s outtree ] && rm outtree
# nor old params files
remove_phylip_param_files

# 2.1) make sure we have an input file or die
if [ -s "$input_phylip" ]
then
     # make basic format validation check
     check_phylip_ok "$input_phylip"
     cp "$input_phylip" infile
else
     echo
     echo "# FATAL ERROR: input phylip file $input_phylip does not exist or is empty" >&2 
     echo "# Make sure $input_phylip is in $wkdir. Exiting ..." >&2
     echo
     exit 1
fi

# ----------------------------------------------------------- #
# >>>>>>>>>>>>>>>> Compute distance matrices <<<<<<<<<<<<<<<< #
# ----------------------------------------------------------- #

# 3. In any case (with or without bootstrap) compute the NJ tree for the original phylip file
#     Run dnadist or protdist, as required   
echo
echo ">>> Computing distance matrix for $input_phylip ..."

if [ "$runmode" -eq 1 ]		 
then
    echo "# running write_dnadist_params $model 0 $TiTv $sequential $gamma $CV"
    write_dnadist_params "$model" "0" "$TiTv" "$sequential" "$gamma" "$CV"
    echo "# running dnadist < dnadist.params"
    dnadist < dnadist.params &> /dev/null
    check_output outfile
    
    # https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
    # Set last item specifically
    # nstead of appending one element, set the last item specifically, without any "unbound variable" error
    # t[${#t[*]}]=foo
    dnadist_outfile="${input_phylip%.*}_${model}${gammaf}gamma_distMat.out"
    cp outfile "$dnadist_outfile" && \
      outfiles+=("$dnadist_outfile")
    
    # check that the distance matrix does not contain negative values due to wrong model parameters
    check_matrix "$dnadist_outfile"    
    
    mv outfile infile
elif [ "$runmode" -eq 2 ]
then
    echo "# running write_protdist_params $model 0 $sequential $gamma $CV"
    write_protdist_params "$model" "0" "$sequential" "$gamma" "$CV"
    echo "# running protdist < protdist.params"
    protdist < protdist.params &> /dev/null
    check_output outfile
    
    protdist_outfile="${input_phylip%.*}_${model}${gammaf}gamma_distMat.out"
    cp outfile "$protdist_outfile" && \
      outfiles+=("$protdist_outfile")
    
    # check that the distance matrix does not contain negative values due to wrong model parameters
    check_matrix "$protdist_outfile"
    
    mv outfile infile
fi


# ------------------------------------------------------------------------------------ #
# >>>>>>>>>>>>>>>> Computing NJ|UPGMA trees from original alignments <<<<<<<<<<<<<<<<< #
# ------------------------------------------------------------------------------------ #

echo
echo ">>> Computing distance tree for $input_phylip ..."  

# 4. now that we have the dist matrix, do the clustering with NJ or UPGMA
echo "# running write_neighbor_params 0 $upgma"
write_neighbor_params "0" "$upgma" "$outgroup"
echo "# running neighbor < neighbor.params"
neighbor < neighbor.params &> /dev/null
check_output outfile
check_output outtree

# 5.1 rename outtrees and tree outfiles; remap bootstrap values to bipartitions and display tree to screen
if [ "$upgma" -gt 0 ]
then
     # https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
     # Set last item specifically
     # instead of appending one element, set the last item specifically, without any "unbound variable" error
     # t[${#t[*]}]=foo
     upgma_tree=
     upgma_outfile=
     
     if [ -s outtree ] 
     then
          upgma_tree="${input_phylip%.*}_${model}${gammaf}gamma_UPGMA.ph"
          mv outtree "$upgma_tree"
	  echo "# wrote tree $upgma_tree to disk" 
	  outfiles[${#outfiles[*]}]="$upgma_tree"
     fi 
     
     if [ -s outfile ]
     then
         upgma_outfile="${input_phylip%.*}_${model}${gammaf}gamma_UPGMA.outfile"
         mv outfile "$upgma_outfile"
         outfiles[${#outfiles[*]}]="$upgma_outfile"
     fi 
       
     # check that there are no negative branch lengths in the nj_tree
     #  and display with nw_display, only if no bootstrapping is requested
     if [ "$boot" -eq 0 ] && [[ $(type -P nw_display) ]] && [[ "$nw_utils_ok" -eq 1 ]]
     then 
         display_treeOK "$upgma_tree"
     elif [ "$boot" -eq 0 ] && { [[ ! $(type -P nw_display) ]] || [[ "$nw_utils_ok" -ne 1 ]]; }
     then
          echo "# extract_tree_from_outfile $upgma_outfile"
	  echo
	  extract_tree_from_outfile "$upgma_outfile"
	  echo
     fi	 
else
     nj_tree=
     nj_outfile=

     if [ -s outtree ] 
     then
	  nj_tree="${input_phylip%.*}_${model}${gammaf}gamma_NJ.ph"
          mv outtree "$nj_tree"
          echo "# wrote tree $nj_tree to disk"
          outfiles[${#outfiles[*]}]="$nj_tree"
     fi
     
     if [ -s outfile ] 
     then
          nj_outfile="${input_phylip%.*}_${model}${gammaf}gamma_NJ.outfile"
          mv outfile "$nj_outfile"
          outfiles[${#outfiles[*]}]="$nj_outfile"
     fi

     # check that there are no negative branch lengths in the nj_tree
     #  and display with nw_display, only if no bootstrapping is requested
     if [ "$boot" -eq 0 ] && [[ $(type -P nw_display) ]] && [[ "$nw_utils_ok" -eq 1 ]]
     then
         display_treeOK "$nj_tree"
     elif [ "$boot" -eq 0 ] && { [[ ! $(type -P nw_display) ]] || [[ "$nw_utils_ok" -ne 1 ]]; }
     then
         echo "# extract_tree_from_outfile $nj_outfile"
	 echo
	 extract_tree_from_outfile "$nj_outfile"
	 echo
     fi	 
fi

echo "# > finished computing distance matrix and tree for $input_phylip!"

if [ "$boot" -eq 0 ]
then
    # 6. Print final output summary message
    echo
    echo '===================== OUTPUT SUMMARY ====================='
    no_outfiles=${#outfiles[@]}
    echo "# $no_outfiles output files were generated:"
    printf "%s\n" "${outfiles[@]}"

    echo
    echo -n "# FINISHED run at: "; date 
    echo "   ==> Exiting now ..."
    echo
else
    echo "=================================================================="
    echo
fi


# ----------------------------------------------------------- #
# >>>>>>>>>>>>>>>>>>>> Bootstrap Analysis <<<<<<<<<<<<<<<<<<< #
# ----------------------------------------------------------- #

# Run seqboot if requested
# run seqboot if -b > 0

if [ "$boot" -gt 0 ] 
then
     # 1. restore the original infile for the bootstrapping and standard NJ/UPGMA analysis below
     cp "$input_phylip" infile
     check_output infile

     echo
     echo ">>> Bootstrap Analysis based on $boot pseudoreplicates for $input_phylip ..."
     echo "# running  write_seqboot_params $boot $sequential"
     write_seqboot_params "$boot" "$sequential"
     echo "# running seqboot < seqboot.params &> /dev/null"
     seqboot < seqboot.params &> /dev/null
     check_output outfile
     mv outfile infile

     echo "# > computing distance matrices on $boot bootstrapped alignments ..."
     # 2. if bootstrapping, then compute consensus tree
     # we need to run dnadist or protdist, depending on runmode
     if [ "$runmode" -eq 1 ]
     then
	 echo "# running write_dnadist_params $model $boot $TiTv $sequential $gamma $CV"
    	 write_dnadist_params "$model" "$boot" "$TiTv" "$sequential" "$gamma" "$CV"
	 echo "# running dnadist < dnadist.params &> /dev/null"
    	 dnadist < dnadist.params &> /dev/null
	 check_output outfile
	 
	 # check that matrix does not contain negative values
	 check_matrix outfile
	 
	 mv outfile infile
     elif [ "$runmode" -eq 2 ]
     then
	 echo
	 echo "# running write_protdist_params $model $boot $sequential $gamma $CV"
    	 write_protdist_params "$model" "$boot" "$sequential" "$gamma" "$CV"
	 echo "# running protdist < protdist.params &> /dev/null"
    	 protdist < protdist.params &> /dev/null
	 check_output outfile
	 
	 # check that matrix does not contain negative values
	 check_matrix outfile

	 mv outfile infile
     fi

     # 3. Now we have the distance matrices and we can proceed equaly for runmodes 1 and 2
     #    >>> Run neighbor
     echo "# > Computing distance trees from bootstrapped data ..."	  
     echo "# running write_neighbor_params $boot $upgma $outgroup"
     write_neighbor_params "$boot" "$upgma" "$outgroup"
     echo "# running neighbor < neighbor.params &> /dev/null"
     neighbor < neighbor.params &> /dev/null
     check_output outtree
     
     boot_trees=
     if [ -s outtree ]
     then
	 # this is the file holding the trees for the n-distance matrices for n-boot replicated alignments
	 boot_trees="${input_phylip%.*}_${model}${gammaf}gamma_${boot}bootRepl_trees.nwk"
	 cp outtree "$boot_trees"
	 
	 # append to array with +=, otherwise will complain as unset with set -u 
	 # https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
	 outfiles+=("$boot_trees")
     fi
     mv outtree intree
     rm outfile

     # 4. Compute consensus tree with consense
     echo "# > Computing MJR consensus tree from trees reconstructed from bootstrap pseudoreplicates ..."
     echo "# running write_consense_params"
     write_consense_params
     echo "# running consense < consense.params &> /dev/null"
     consense < consense.params &> /dev/null
     check_output outtree
     check_output outfile
     
     # variables holding consensus trees
     upgma_consensus_tree=
     upgma_consensus_outfile=
     nj_consensus_tree=
     nj_consensus_outfile=

     if [ "$upgma" -gt 0 ]
     then
	 # https://fvue.nl/wiki/Bash:_Error_%60Unbound_variable%27_when_appending_to_empty_array
         # Set last item specifically
	 # instead of appending one element, set the last item specifically, without any "unbound variable" error
	 # t[${#t[*]}]=foo
	 upgma_consensus_tree="${input_phylip%.*}_UPGMAconsensus_${model}${gammaf}gamma_${boot}bootRepl.ph"
	 mv outtree "$upgma_consensus_tree"
	 outfiles[${#outfiles[*]}]="$upgma_consensus_tree"
    	 
	 upgma_consensus_outfile="${input_phylip%.*}_UPGMAconsensus_${model}${gammaf}gamma_${boot}bootRepl.outfile"
	 mv outfile "$upgma_consensus_outfile"
	 outfiles[${#outfiles[*]}]="$upgma_consensus_outfile"
     else
         nj_consensus_tree="${input_phylip%.*}_NJconsensus_${model}${gammaf}gamma_${boot}bootRepl.ph"
	 mv outtree "$nj_consensus_tree"
	 outfiles[${#outfiles[*]}]="$nj_consensus_tree"
    	 
	 nj_consensus_outfile="${input_phylip%.*}_NJconsensus_${model}${gammaf}gamma_${boot}bootRepl.outfile"
	 mv outfile "$nj_consensus_outfile"
	 outfiles[${#outfiles[*]}]="$nj_consensus_outfile"
     fi 

     # 5. Rename outtrees and tree outfiles; remap bootstrap values to bipartitions and display tree on screen
     if [ "$upgma" -gt 0 ]
     then
          # if we requested bootstrapping, map bootstrap values onto UPGMA tree using
          #   nw_support upgma.ph bootRepl_tree.ph > UPGMA_with_boot_support.ph
          if [ -s "$upgma_tree" ] && [ -s "$boot_trees" ] && [[ $(type -P nw_support) ]] && [ "$nw_utils_ok" -eq 1 ]
          then
	      upgma_tree_with_boot="${input_phylip%.*}_${model}${gammaf}gamma_UPGMA_with_${boot}boot_support.ph"
	      echo "# mapping bootstrap values on UPGMA tree with nw_support ..."
	      nw_support "$upgma_tree" "$boot_trees" > "$upgma_tree_with_boot"

	      if [ -s "$upgma_tree_with_boot" ] && [[ $(type -P nw_display) ]] && [ "$nw_utils_ok" -eq 1 ]
	      then
	          outfiles[${#outfiles[*]}]="$upgma_tree_with_boot"
	          
		  # check that there are no negative branch lengths in the nj_tree 
		  #   before displaying with nw_display
		  display_treeOK "$upgma_tree_with_boot"
	      fi
	  elif [ -s "$upgma_outfile" ] && [ -s "$upgma_consensus_outfile" ] && { [[ ! $(type -P nw_support) ]] || [ "$nw_utils_ok" -ne 1 ]; }
	  then
               echo "# extract_tree_from_outfile $upgma_outfile"
	       echo
	       extract_tree_from_outfile "$upgma_outfile"
	       echo
	       echo "# extract_tree_from_outfile $upgma_consensus_outfile"
	       echo
	       extract_tree_from_outfile "$upgma_consensus_outfile"
	       echo
	  fi
     else
         # if we requested bootstrapping, map bootstrap values onto NJ tree using
         #   nw_support NJ.ph bootRepl_tree.ph > NJ_with_boot_support.ph
         if [ -s "$nj_tree" ] && [ -s "$boot_trees" ] && [[ $(type -P nw_support) ]] && [ "$nw_utils_ok" -eq 1 ]
         then
             nj_tree_with_boot="${input_phylip%.*}_${model}${gammaf}gamma_NJ_with_${boot}boot_support.ph"
	     echo "# mapping bootstrap values on NJ tree with nw_support ..."
	     nw_support "$nj_tree" "$boot_trees" > "$nj_tree_with_boot"
	 
	     check_output "$nj_tree_with_boot"
	 
	     if [ -s "$nj_tree_with_boot" ] && [[ $(type -P nw_display) ]] && [ "$nw_utils_ok" -eq 1 ]
	     then
	         outfiles+=("$nj_tree_with_boot")
	         
		 # check that there are no negative branch lengths in the nj_tree 
		 #   before displaying with nw_display
		 display_treeOK "$nj_tree_with_boot"
	     fi
    	 elif [ -s "$nj_outfile" ] && [ -s "$nj_consensus_outfile" ] && { [[ ! $(type -P nw_support) ]] || [ "$nw_utils_ok" -ne 1 ]; }
	 then

                echo "# extract_tree_from_outfile $nj_outfile"
	        echo
	        extract_tree_from_outfile "$nj_outfile"
		echo

		echo "# extract_tree_from_outfile $nj_consensus_outfile"
		echo
		extract_tree_from_outfile "$nj_consensus_outfile"
		echo
         fi
    fi
fi

# 6. Tidy up: remove the *params files and other temporary files 
# that could interfere with future runs and litter the directory

[ "$DEBUG" -eq 0 ] && cleanup_dir

# 7. Print final output summary message
echo
echo '===================== OUTPUT SUMMARY ====================='

no_outfiles=${#outfiles[@]}
echo "# $no_outfiles output files were generated:"
printf "%s\n" "${outfiles[@]}"

echo
echo -n "# FINISHED run at: "; date 
echo "   ==> Exiting now ..."
echo