#!/bin/bash ### genEra v1.4.1 (C) Max Planck Society for the Advancement of Science ### ### Code developed by Josue Barrera-Redondo ### ### This program is free software: you can redistribute it and/or modify ### it under the terms of the GNU General Public License as published by ### the Free Software Foundation, either version 3 of the License, or ### (at your option) any later version. ### ### This program is distributed in the hope that it will be useful, ### but WITHOUT ANY WARRANTY; without even the implied warranty of ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ### GNU General Public License for more details. VERSION='1.4.1' QUERY_FASTA='' QUERY_PDB='' NCBITAX='' NR_DB='' ALPHAFOLD_DB='' THREADS=$(nproc) THRESHOLD='30' DIAMONDOUT='' TAXDUMP='' RAWTAX='' CUSTOMTAX='' CUSTOMDATA='' CUSTOM_PDB='' CUSTOMGENOMES='' EVALUE='1e-5' FOLDSEEK_MAXSEQS='10000' DIAMONDOPTS='' MINHITS='10' PRINTOLDEST='false' DIVERGENCE='' OUTDIR=$(pwd) TMP_PATH='' SENSITIVITY='sensitive' JACKHMMER='false' HMMERTEST='2' STRAINS='' INPUT_TREE='' FAST_STEP3='true' print_usage() { printf "\ngenEra v${VERSION} (C) Max Planck Society for the Advancement of Science\n\n BASIC USAGE\n\tgenEra -q [query_sequences.fasta] -t [query_taxid] -b [path/to/nr] -d [path/to/taxdump]\n OR\n\tgenEra -Q [query_structure_directory] -t [query_taxid] -B [path/to/AlphaFold_DB] -d [path/to/taxdump]\n\n DESCRIPTION\n\tgenEra is an easy-to-use, low-dependency command-line tool that\n\testimates the age of the earliest common ancestor of protein\n\tcoding genes though genomic phylostratigraphy.\n\n MANDATORY ARGUMENT ALWAYS\n\t-t\tNCBI Taxonomy ID of query species (search for the taxid of\n\t\tyour query species at https://www.ncbi.nlm.nih.gov/taxonomy)\n\n OPTIONS FOR MANDATORY INPUT FILE\n\t-q\tFile with query protein sequences in FASTA format to perform\n\t\tpairwise sequence alignments using DIAMOND\n\t-Q\tDirectory with query structural predictions in PDB format\n\t\tto perform pairwise structural alignments using Foldseek\n\n OPTIONS FOR MANDATORY DATABASE\n\t-b\tPath to a locally installed database for DIAMOND (FASTA only)\n\t-B\tPath to a locally installed database for Foldseek (PDB only)\n\t-p\tPre-generated DIAMOND/Foldseek table (skip step 1), with the\n\t\tquery genes in the first column, the bitscore in the second\n\t\tto last column and the target taxid in the last column\n\t\t(IMPORTANT: the query sequences must also be searched\n\t\tagainst themselves for genEra to work properly)\n\n OPTIONS FOR MANDATORY TAXONOMY FILE\n\t-d\tLocation of the uncompressed taxonomy dump from the NCBI\n\t\t(ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)\n\t-r\tRaw \"ncbi_lineages\" file generated by ncbitax2lin\n\t\t(saves time on step 2)\n\t-c\tCustom \"ncbi_lineages\" file that is already tailored for the\n\t\tquery species (skip step 2). The table should be arranged so\n\t\tthat the taxid is in the first column and all the phylostrata\n\t\tof interest are organized from the species level all the way\n\t\tback to \"cellular organisms\"\n\n IMPORTANT OPTIONAL ARGUMENTS\n\t-n\tNumber of threads to run genEra (genEra can run with a single\n\t\tthread, but it is HIGHLY suggested to use as many threads as\n\t\tpossible) (default: all available threads)\n\t-o\tDirectory to save output files (default: working directory)\n\t-v\tList with protein FASTA files to perform an infraspecies-level\n\t\tanalysis against the query proteome. Particularly useful for\n\t\tdetecting candidate de novo genes. The user can give GenEra a\n\t\tprecomputed phylogeny with -z or let GenEra estimate the\n\t\tevolutionary relationship between the listed proteomes using\n\t\tOrthoFinder. Make sure the file names are unique and recognizable.\n\t-z\tPrecomputed phylogeny in NEWICK format for the infraspecies-level\n\t\tanalysis. Make sure that the names in the phylogeny match the\n\t\tprefixes of the FASTA files (-v).\n\t-s\tTable with pairwise evolutionary distances (substitutions/site)\n\t\tbetween several species in the database and the query species\n\t\t(necessary to calculate homology detection failure probabilities\n\t\twith abSENSE). NOTE: the query species SHOULD be included in\n\t\tthis table. The table should be tab-delimited and have the\n\t\tfollowing format:\n\t\t query_sp_taxid\t0\n\t\t species_1_taxid\tdistance_1\n\t\t species_2_taxid\tdistance_2\n\t-a\tTable with additional proteins to be included in the analysis\n\t\t(e.g., proteins from species that are absent from the nr).\n\t\tUse this option when using FASTA sequences as input.\n\t\tThe table should be tab-delimited and have the following format:\n\t\t /path/to/species_1.fasta\ttaxid_1\n\t\t /path/to/species_2.fasta\ttaxid_2\n\t\t /path/to/species_3.fasta\ttaxid_3\n\t-f\tTable with additional nucleotide sequences to search against\n\t\tyour query proteins. Particularly useful with genome assemblies\n\t\tfor improved orphan gene classification. Use this option when\n\t\tusing FASTA sequences as input. The table should be\n\t\ttab-delimited and have the following format:\n\t\t /path/to/genome_1.fasta\ttaxid_1\n\t\t /path/to/genome_2.fasta\ttaxid_2\n\t\t /path/to/genome_3.fasta\ttaxid_3\n\t-A\tTable with additional structural predictions to be included\n\t\tin the analysis (e.g., structural predictions from species\n\t\tthat are absent from the AlphaFold DB). Use this option when\n\t\tusing PDB files as input. The table should be tab-delimited\n\t\tand have the following format:\n\t\t /path/to/species_directory_1\ttaxid_1\n\t\t /path/to/species_directory_2\ttaxid_2\n\t\t /path/to/species_directory_3\ttaxid_3\n\t-i\tWhen true, prints an additional output file with the best\n\t\tsequence hit responsible for the oldest gene age assignment\n\t\tfor each of the query genes (default: true)\n\t-F\tSpeed up STEP 3 gen age assignment at the cost of using\n\t\tmore RAM, e.g 200 GB RAM for 180 GB STEP 1 .bout output,\n\t\tand temporary files, as much temp files as the number of\n\t\tquery genes. Please take into account the requeriments before\n\t\trunning. Set -F false to run genEra with fewer resources at the\n\t\tcost of longer running times. Speed up of x ~ 5 times faster per\n\t\t10,000 query genes (-F true vs -F false). Speed improvement\n\t\twill increase gradually with bigger queries. (default: true)\n\n FINE-TUNNING ARGUMENTS (DEFAULT IS USUALLY FINE)\n\t-l\tTaxonomic representativeness threshold below which a gene will\n\t\tbe flagged as putative genome contamination or the product of\n\t\ta horizontal gene transfer (HGT) event (default: 30)\n\t-e\tE-value threshold for DIAMOND and Foldseek (default: 1e-5)\n\t-u\tAdditional options to feed DIAMOND or Foldseek, based on user\n\t\tpreferences (filtering hits by identity or query coverage)\n\t\tUsers should input the additional commands in quotes, using\n\t\tthe original arguments from DIAMOND (Example: -o \"--id 30\")\n\t\tor from Foldseek (Example: -o \"-c 0.8 --cov-mode 0\")\n\t-m\tMinimum percentage of matches between your query sequences\n\t\tand any species within a taxonomic level to consider it useful\n\t\tfor the gene age assignment (i.e., filtering taxonomic levels\n\t\tlacking complete genomes in the database) (default: 10)\n\t-x\tAlternative path where you would like to store the temporary\n\t\tfiles as well as the DIAMOND/Foldseek results (warning: genEra\n\t\twill generate HUGE temporary files) (default: the files will\n\t\tbe stored in a tmp_[RAMDOMNUM]/ directory created by genEra)\n\t-y\tModify the sensitivity parameter in DIAMOND for faster\n\t\tor more sensitive results in step 1 (default: sensitive)\n\t-M\tAdjust the amount of prefilter made by Foldseek (i.e., the\n\t\tmaximum number of hits that are reported) (default: 10000)\n\t-j\tWhen true, run an additional search step using jackhmmer\n\t\tagainst the online Uniprot database (default: false)\n\t\t(warning 1: Only available when using FASTA sequences)\n\t\t(warning 2: this step requires internet connection)\n\t-k\tStarting taxonomic rank of genes that will be re-analyzed\n\t\tusing jackhmmer (default: 2)\n\t-h\tPrint this help message and exit\n\n" } while getopts ':q:Q:t:b:B:n:l:p:d:r:c:a:A:f:e:u:m:M:i:s:o:x:y:j:k:v:z:F:h' flag; do case "${flag}" in q) QUERY_FASTA=${OPTARG} ;; Q) QUERY_PDB=${OPTARG} ;; t) NCBITAX=${OPTARG} ;; b) NR_DB=${OPTARG} ;; B) ALPHAFOLD_DB=${OPTARG} ;; n) THREADS=${OPTARG} ;; l) THRESHOLD=${OPTARG} ;; p) DIAMONDOUT=${OPTARG} ;; d) TAXDUMP=${OPTARG} ;; r) RAWTAX=${OPTARG} ;; c) CUSTOMTAX=${OPTARG} ;; a) CUSTOMDATA=${OPTARG} ;; A) CUSTOM_PDB=${OPTARG} ;; f) CUSTOMGENOMES=${OPTARG} ;; e) EVALUE=${OPTARG} ;; u) DIAMONDOPTS=${OPTARG} ;; m) MINHITS=${OPTARG} ;; M) FOLDSEEK_MAXSEQS=${OPTARG} ;; i) PRINTOLDEST=${OPTARG} ;; s) DIVERGENCE=${OPTARG} ;; o) OUTDIR=${OPTARG} ;; x) TMP_PATH=${OPTARG} ;; y) SENSITIVITY=${OPTARG} ;; j) JACKHMMER=${OPTARG} ;; k) HMMERTEST=${OPTARG} ;; v) STRAINS=${OPTARG} ;; z) INPUT_TREE=${OPTARG} ;; F) FAST_STEP3=${OPTARG} ;; h) print_usage ; exit;; *) echo ; echo " ERROR: One or more invalid arguments"; print_usage ; exit;; esac done # Check if Erassingnation.sh was added to the PATH PHYLOSCRIPT=$(which Erassignment) if [[ -z ${PHYLOSCRIPT} ]]; then echo echo " ERROR: Please make sure that Erassignment is located in your PATH" echo exit 1 fi # Check if mcl is correctly installed MCLEXEC=$(which mcl) MCXLOADEXEC=$(which mcxload) MCXDUMPEXEC=$(which mcxdump) if [[ -z ${MCLEXEC} ]] || [[ -z ${MCXLOADEXEC} ]] || [[ -z ${MCXDUMPEXEC} ]]; then echo echo " ERROR: Please make sure that mcl, mcxload and mcxdump are located in your PATH" echo exit 1 fi # If using a FASTA file, check if DIAMOND is correctly installed and with version 2 onwards if [[ -f $QUERY_FASTA ]]; then DIAMONDEXEC=$(which diamond) if [[ -z ${DIAMONDEXEC} ]]; then echo echo " ERROR: Please make sure that diamond is located in your PATH" echo exit 1 fi DIAMONDVERSION=$(diamond help | head -1 | cut -d' ' -f2 | cut -d. -f1 | sed 's/v//g') if [[ ${DIAMONDVERSION} -lt 2 ]]; then echo echo " ERROR: The installed version of DIAMOND is outdated" echo " Please update DIAMOND to version 2.0.0 or higher" echo exit 1 fi fi # Check if Foldseek is correctly installed if [[ -d $QUERY_PDB ]]; then FOLDSEEKEXEC=$(which foldseek) if [[ -z ${FOLDSEEKEXEC} ]]; then echo echo " ERROR: Please make sure that foldseek is located in your PATH" echo exit 1 fi fi # If running the infraspecies-level analysis, make sure tree2ncbitax and OrthoFinder are installed if [[ -f $STRAINS ]]; then ORTHOFINDEREXEC=$(which orthofinder) TREESCRIPT=$(which tree2ncbitax) if [[ -z ${TREESCRIPT} ]]; then echo echo " ERROR: The infraspecies-level analysis was invoked (-v), but tree2ncbitax in missing. Make sure that tree2ncbitax is located in your PATH" echo exit 1 fi if [ ! "$INPUT_TREE" ]; then if [[ -z ${ORTHOFINDEREXEC} ]]; then echo echo "ERROR: The infraspecies-level analysis was invoked without an specified tree, but orthofinder is missing. Make sure that orthofinder is located in your PATH, or feed genEra with a pre-computed phylogeny (-z) for the analysis" echo exit 1 fi fi fi # If FAST_STEP3 is turned true check if FASTSTEP3R scripts is added to the PATH if [[ "${FAST_STEP3}" = "true" ]]; then F3R=$(which FASTSTEP3R) if [[ -z ${F3R} ]]; then echo echo " ERROR: Please make sure that FASTSTEP3R script is located in your PATH" echo exit 1 fi fi # Check if the mandarory arguments were used if [ ! "$QUERY_FASTA" ] && [ ! "$QUERY_PDB" ]; then echo echo " ERROR: The user must provide either a protein multi-FASTA file (-q) or a directory containing protein PDB files (-Q)" print_usage >&2; exit 1 fi if [ ! "$NCBITAX" ]; then echo echo " ERROR: The user must provide a NCBI taxonomy ID of the query species (-t)" print_usage >&2; exit 1 fi # Avoid using FASTA and PDB files at the same time if [[ -f $QUERY_FASTA ]] && [[ -d $QUERY_PDB ]]; then echo echo " ERROR: The user must NOT provide a FASTA file (-q) and a directory with PDB files (-Q) at the same time" echo " Choose one or the other" print_usage >&2; exit 1 fi # Quick check to verify that the input sequences exist if [ "$QUERY_FASTA" ]; then if [[ ! -f $QUERY_FASTA ]]; then echo echo " ERROR: The query FASTA file ${QUERY_FASTA} does not exist" print_usage >&2; exit 1 fi fi if [ "$QUERY_PDB" ]; then if [[ ! -d $QUERY_PDB ]]; then echo echo " ERROR: The query directory with PDB files ${QUERY_PDB} does not exist" print_usage >&2; exit 1 fi PDBSIZE=$(ls -l ${QUERY_PDB} | head -1 | cut -d " " -f2) if [ $PDBSIZE -eq 0 ]; then echo echo " ERROR: The query directory ${QUERY_PDB} is empty" print_usage >&2; exit 1 fi fi # Check whether the user specified the other mandarory arguments if [ "$QUERY_FASTA" ]; then if [ -z "$NR_DB" ] && [ -z "$DIAMONDOUT" ]; then echo echo " ERROR: The user must provide either the path to a local DIAMOND database (-b) or a pre-generated DIAMOND/Foldseek table (-p)" print_usage >&2; exit 1 fi fi if [ "$QUERY_PDB" ]; then if [ -z "$ALPHAFOLD_DB" ] && [ -z "$DIAMONDOUT" ]; then echo echo " ERROR: The user must provide either the path to a local Foldseek database (-b) or a pre-generated DIAMOND/Foldseek table (-p)" print_usage >&2; exit 1 fi fi if [ -z "$TAXDUMP" ] && [ -z "$RAWTAX" ] && [ -z "$CUSTOMTAX" ]; then echo echo " ERROR: One of the following arguments must be provided (-d, -r or -c)" print_usage >&2; exit 1 fi # If homology detection failure is invoked, make sure that abSENSE is installed and that the input file exists if [[ -f ${DIVERGENCE} ]]; then ABSENSE_SCRIPT=$(which Run_abSENSE.py) if [[ -z ${ABSENSE_SCRIPT} ]]; then echo echo " ERROR: Run_abSENSE.py was invoked (-s), but the script is not located in your PATH" echo " Please make sure to install it correctly" echo exit 1 fi if [[ ! -f ${DIVERGENCE} ]]; then echo echo " ERROR: genEra could not find the file with the pairwise evolutionary distances:" echo " ${DIVERGENCE}" echo " Exiting" exit 1 fi fi echo "genEra v${VERSION} (C) Max Planck Society for the Advancement of Science" echo "Starting time of run:" date # Create output directory if it doesn't exist if [[ ! -d ${OUTDIR} ]]; then mkdir -p ${OUTDIR} fi # create a temporary directory if no specific path is given by the user if [[ -z ${TMP_PATH} ]]; then RANDOMSEED=$(echo $RANDOM) mkdir tmp_${NCBITAX}_${RANDOMSEED} && cd tmp_${NCBITAX}_${RANDOMSEED} TMP_PATH=$(pwd) cd ../ else RANDOMSEED=$(echo $RANDOM) RUNNINGPATH=$(pwd) cd ${TMP_PATH} && mkdir tmp_${NCBITAX}_${RANDOMSEED} && cd tmp_${NCBITAX}_${RANDOMSEED} TMP_PATH=$(pwd) cd ${RUNNINGPATH} fi echo echo "Your temporary files will be stored in ${TMP_PATH}" # Check whether the FASTA sequences are gzipped or not if [[ -f ${QUERY_FASTA} ]]; then FASTACHECK=$(file ${QUERY_FASTA} | cut -d " " -f2) if [[ ${FASTACHECK} == "symbolic" ]]; then QUERY_REALPATH=$(ls -l ${QUERY_FASTA} | awk '{ print $11 }') QUERY_FASTA="${QUERY_REALPATH}" FASTACHECK=$(file ${QUERY_FASTA} | cut -d " " -f2) fi if [[ ${FASTACHECK} == "gzip" ]]; then FASTA_PREFIX=$(echo "${QUERY_FASTA##*/}" | cut -d. -f1) zcat < ${QUERY_FASTA} > ${TMP_PATH}/${FASTA_PREFIX}.fasta QUERY_FASTA="${TMP_PATH}/${FASTA_PREFIX}.fasta" fi QUERYSIZE1=$(grep -c ">" ${QUERY_FASTA}) QUERYSIZE2=$(wc -l ${QUERY_FASTA} | cut -d" " -f1) if [ "$QUERYSIZE1" -lt 1 ] || [ "$QUERYSIZE2" -lt 2 ]; then echo echo " ERROR: The protein sequences in ${QUERY_FASTA} are not in FASTA format, or the file is empty" print_usage >&2; exit 1 fi fi ####################################################### ### GenEra: STEP 1 ### ####################################################### if [ -n "$DIAMONDOUT" ]; then # Check that the pre-generated DIAMOND file exists if [[ ! -f ${DIAMONDOUT} ]]; then echo echo " ERROR: genEra could not find the pre-generated DIAMOND/Foldseek file:" echo " ${DIAMONDOUT}" echo " Exiting" exit 1 fi # Check that the pre-generated DIAMOND file is not empty if [ -s ${DIAMONDOUT} ]; then if [[ -f ${QUERY_FASTA} ]]; then echo echo "DIAMOND OUTPUT ALREADY GENERATED. SKIPPING STEP 1" echo echo "We're just going to quickly cluster the query genes against themselves for later on (step 3)" # Generate DIAMOND database for self-match diamond makedb \ --in ${QUERY_FASTA} \ --db ${TMP_PATH}/tmp_selfmatch_db --quiet # Create a self-match table for the clustering analysis diamond blastp --${SENSITIVITY} \ --query ${QUERY_FASTA} \ --db ${TMP_PATH}/tmp_selfmatch_db \ --outfmt 6 qseqid sseqid evalue bitscore \ --evalue ${EVALUE} --max-target-seqs 0 \ --threads ${THREADS} --out ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --quiet ${DIAMONDOPTS} # Create input file for MCL cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc # Remove temporary files rm ${TMP_PATH}/tmp_selfmatch_db* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout elif [[ -d ${QUERY_PDB} ]]; then echo echo "FOLDSEEK OUTPUT ALREADY GENERATED. SKIPPING STEP 1" echo echo "We're just going to quickly cluster the query genes against themselves for later on (step 3)" # Generate Foldseek database for self-match foldseek createdb ${QUERY_PDB} ${TMP_PATH}/tmp_${NCBITAX}_DB -v 0 foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS} foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --format-output "query,target,evalue,bits" -v 0 # Create input file for MCL cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc # Remove temporary files rm ${TMP_PATH}/tmp_${NCBITAX}_DB* ${TMP_PATH}/tmp_${NCBITAX}_aln* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout rm -r ${TMP_PATH}/tmp_${NCBITAX}_foldseek/ fi # If pre-generated DIAMOND file is empty, show error mesage and exit else echo echo " ERROR: The pre-generated DIAMOND/MMseqs2 file exists, but it is empty" echo " Exiting" echo exit 1 fi elif [[ -f ${QUERY_FASTA} ]]; then if [[ ! -f ${NR_DB}.dmnd ]]; then echo echo " ERROR: genEra could not find the locally installed database for DIAMOND:" echo " ${NR_DB}.dmnd" echo " Please refer to the manual for information on how to generate such database" echo " Exiting" exit 1 fi echo echo "STARTING STEP 1: SEARCHING FOR HOMOLOGS WITHIN THE DATABASE USING DIAMOND" # Check whether DIAMOND can be found by genEra DIAMONDPATH=$(which diamond) if [[ -z ${DIAMONDPATH} ]]; then echo echo " ERROR: genEra could not find the diamond excecutable. Unable to run step 1" echo " Please make sure that DIAMOND is properly installed and located in the PATH" echo " Exiting" exit 1 fi # Start with the infraspecies analysis (in case the user gives the necessary input) if [[ -f ${STRAINS} ]]; then echo "--------------------------------------------------" echo "Infraspecies-level analysis was invoked!" echo "Adding infraspecies nodes to the analysis based on a phylogenetic tree" if [[ -f ${INPUT_TREE} ]]; then echo "The phylogenetic tree was provided by the user, skipping the phylogenetic inference" else echo "Running OrthoFinder to predict the phylogenetic relationships between genomes" # Create a temporary directory for the orthofinder analysis mkdir ${TMP_PATH}/orthofinder # Copy the query FASTA file to the orthofinder directory cp ${QUERY_FASTA} ${TMP_PATH}/orthofinder/ # Loop over every FASTA file IFS=$'\n' for STRAIN_FASTA in $(cat ${STRAINS}); do FASTACHECK=$(file ${STRAIN_FASTA} | cut -d " " -f2) # Check that the file is not a symbolic link if [[ ${FASTACHECK} == "symbolic" ]]; then FASTA_REALPATH=$(ls -l ${STRAIN_FASTA} | awk '{ print $11 }') STRAIN_FASTA="${FASTA_REALPATH}" FASTACHECK=$(file ${STRAIN_FASTA} | cut -d " " -f2) fi # Extract the prefix of every FASTA file STRAIN_PREFIX=$(echo "${STRAIN_FASTA##*/}" | cut -d. -f1) # Copy each FASTA file to the orthofinder directory if [[ ${FASTACHECK} == "gzip" ]]; then zcat < ${STRAIN_FASTA} > ${TMP_PATH}/orthofinder/${STRAIN_PREFIX}.fasta else cat ${STRAIN_FASTA} > ${TMP_PATH}/orthofinder/${STRAIN_PREFIX}.fasta fi done #Run Orthofinder to obtain a phylogenetic tree orthofinder -t ${THREADS} -a ${THREADS} -f ${TMP_PATH}/orthofinder -o ${TMP_PATH}/${NCBITAX}_orthofinder_results > ${TMP_PATH}/orthofinder_log_${NCBITAX} # Save the Orthofinder rooted tree and dispose of the orthofinder input files in tmp cp ${TMP_PATH}/${NCBITAX}_orthofinder_results/Results_*/Species_Tree/SpeciesTree_rooted.txt ${OUTDIR}/${NCBITAX}_orthofinder_tree.nwk INPUT_TREE="${OUTDIR}/${NCBITAX}_orthofinder_tree.nwk" echo "--------------------------------------------------" echo "The phylogenetic relationships of the genomes can be found in ${INPUT_TREE}" rm -fr ${TMP_PATH}/orthofinder/ echo "The raw OrthoFinder results can be found in ${TMP_PATH}/${NCBITAX}_orthofinder_results" fi echo "--------------------------------------------------" echo "Creating a taxonomy file with the infraspecies-level nodes from the phylogeny" TARGET_GENOME=$(echo "${QUERY_FASTA##*/}" | cut -d. -f1) tree2ncbitax -i ${INPUT_TREE} -g ${TARGET_GENOME} -t ${NCBITAX} -o ${TMP_PATH}/${TARGET_GENOME}.csv # Include the additional proteomes to the analysis if [ ! "$CUSTOMDATA" ]; then CUSTOMDATA="${TMP_PATH}/customdata.tsv" fi awk -F "," '{ print $2"\t"$1 }' ${TMP_PATH}/${TARGET_GENOME}.csv > ${TMP_PATH}/tmp_strainIDs.tsv while read -r file; do STRAIN_PREFIX=$(echo "${file##*/}" | cut -d. -f1) STRAIN_NUM=$(grep "^$STRAIN_PREFIX"$'\t' ${TMP_PATH}/tmp_strainIDs.tsv | cut -f 2) if [ -n "$STRAIN_NUM" ]; then echo -e "${file}\t${STRAIN_NUM}" >> "${CUSTOMDATA}" fi done < "${STRAINS}" echo "The additional genomes will be included in the analysis through the file ${CUSTOMDATA}" rm ${TMP_PATH}/tmp_strainIDs.tsv fi echo "--------------------------------------------------" echo "Matching the query genes against themselves" # Generate DIAMOND database for self-match diamond makedb \ --in ${QUERY_FASTA} \ --db ${TMP_PATH}/tmp_selfmatch_db --quiet # Create a self-match table to find orphan genes and for the clustering analysis diamond blastp --${SENSITIVITY} \ --query ${QUERY_FASTA} \ --db ${TMP_PATH}/tmp_selfmatch_db \ --outfmt 6 qseqid sseqid evalue bitscore \ --evalue ${EVALUE} --max-target-seqs 0 \ --threads ${THREADS} --out ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --quiet ${DIAMONDOPTS} # Create input file for MCL cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc # Add the self-matches to the homology table awk -v NCBITAX="$NCBITAX" '{ print $0"\t"NCBITAX }' ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/${NCBITAX}_Diamond_results.bout # Remove temporary files rm ${TMP_PATH}/tmp_selfmatch_db* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout echo "--------------------------------------------------" echo "Searching for homologs against the DIAMOND database" # Search for hits between the query species and the DIAMOND database diamond blastp --${SENSITIVITY} \ --query ${QUERY_FASTA} \ --db ${NR_DB} \ --outfmt 6 qseqid sseqid evalue bitscore staxids \ --evalue ${EVALUE} --max-target-seqs 0 \ --threads ${THREADS} --out ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout --quiet ${DIAMONDOPTS} # Verify that DIAMOND ran correctly, otherwise abort and show error if [[ -s ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout ]]; then echo "DIAMOND results written!" else echo echo " ERROR: DIAMOND didn't run properly, verify that the database was built correctly" echo " Exiting" exit 1 fi # Remove hits that do not have any associated taxids awk '$5!=""' ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout rm ${TMP_PATH}/${NCBITAX}_Diamond_prefiltered_results.bout # Look for homologs in a custom protein database if [ -n "$CUSTOMDATA" ]; then if [[ -f ${CUSTOMDATA} ]]; then echo "--------------------------------------------------" echo "Custom protein database found. Searching for homologs in the custom database using DIAMOND" # Generate a list of all gene names with their respective taxid, and generate a custom FASTA file with all custom sequences IFS=$'\n' for SPECIES in $(cat ${CUSTOMDATA}); do SP_FASTA=$(echo $SPECIES | cut -d " " -f1) SP_TAXID=$(echo $SPECIES | cut -d " " -f2) FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2) if [[ ${FASTACHECK} == "symbolic" ]]; then FASTA_REALPATH=$(ls -l ${SP_FASTA} | awk '{ print $11 }') SP_FASTA="${FASTA_REALPATH}" FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2) fi if [[ ${FASTACHECK} == "gzip" ]]; then zcat < ${SP_FASTA} > ${TMP_PATH}/tmp_${SP_TAXID}_proteins ${SP_FASTA}="${TMP_PATH}/tmp_${SP_TAXID}_proteins" fi grep ">" ${SP_FASTA} | awk '{ print $1 }' | sed 's/>//g' | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_custom_taxid_table cat ${SP_FASTA} >> ${TMP_PATH}/tmp_custom_database.fasta if [[ -f ${TMP_PATH}/tmp_${SP_TAXID}_proteins ]]; then rm ${TMP_PATH}/tmp_${SP_TAXID}_proteins fi done # Generate DIAMOND custom database diamond makedb \ --in ${TMP_PATH}/tmp_custom_database.fasta \ --db ${TMP_PATH}/tmp_custom_database --quiet # Search for hits against the custom database diamond blastp --${SENSITIVITY} \ --query ${QUERY_FASTA} \ --db ${TMP_PATH}/tmp_custom_database \ --outfmt 6 qseqid sseqid evalue bitscore \ --evalue ${EVALUE} --max-target-seqs 0 \ --threads ${THREADS} --out ${TMP_PATH}/${NCBITAX}_custom_DB.bout --quiet ${DIAMONDOPTS} # Add the taxids to the results of the custom database and merge them with the nr results awk '{ print $2 }' ${TMP_PATH}/${NCBITAX}_custom_DB.bout > ${TMP_PATH}/tmp_sseqids awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_custom_taxid_table ${TMP_PATH}/tmp_sseqids > ${TMP_PATH}/tmp_sseqids_taxids awk '{ print $2 }' ${TMP_PATH}/tmp_sseqids_taxids | paste -d "\t" ${TMP_PATH}/${NCBITAX}_custom_DB.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout rm ${TMP_PATH}/tmp_custom_taxid_table ${TMP_PATH}/tmp_custom_database* ${TMP_PATH}/tmp_sseqids ${TMP_PATH}/tmp_sseqids_taxids ${TMP_PATH}/${NCBITAX}_custom_DB.bout else # In case the file does not exist, skip this step echo "--------------------------------------------------" echo " ERROR: The user specified a custom protein database, but the following file does not exist:" echo " ${CUSTOMDATA}" echo " genEra will continue without including the custom protein database" fi fi if [ -n "$CUSTOMGENOMES" ]; then if [[ -f ${CUSTOMGENOMES} ]]; then echo "--------------------------------------------------" echo "Custom nucleotide database found. Searching proteins against nucleotides using MMseqs2" # Check whether MMseqs2 can be found by genEra (This will not result in exit 1) MMSEQSPATH=$(which mmseqs) if [[ -z ${MMSEQSPATH} ]]; then echo echo " ERROR: genEra could not find mmseqs in the PATH" echo " Please make sure to install MMseqs2 before using a custom nucleotide database" fi # Generate a list of all the genome contigs with their respective taxid, and generate a custom FASTA file with all the custom genomes IFS=$'\n' for SPECIES in $(cat ${CUSTOMGENOMES}); do SP_FASTA=$(echo $SPECIES | cut -d " " -f1) SP_TAXID=$(echo $SPECIES | cut -d " " -f2) FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2) if [[ ${FASTACHECK} == "symbolic" ]]; then FASTA_REALPATH=$(ls -l ${SP_FASTA} | awk '{ print $11 }') SP_FASTA="${FASTA_REALPATH}" FASTACHECK=$(file ${SP_FASTA} | cut -d " " -f2) fi if [[ ${FASTACHECK} == "gzip" ]]; then zcat < ${SP_FASTA} > ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides SP_FASTA="${TMP_PATH}/tmp_${SP_TAXID}_nucleotides" fi grep ">" ${SP_FASTA} | awk '{ print $1 }' | sed 's/>//g' | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_custom_genome_taxid_table cat ${SP_FASTA} >> ${TMP_PATH}/tmp_custom_genome_database.fasta if [[ -f ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides ]]; then rm ${TMP_PATH}/tmp_${SP_TAXID}_nucleotides fi done mkdir ${TMP_PATH}/tmp_mmseqs # Create databases for mmseqs mmseqs createdb ${QUERY_FASTA} ${TMP_PATH}/tmp_${NCBITAX}_DB --dbtype 1 > mmseqs2_log_${NCBITAX} mmseqs createdb ${TMP_PATH}/tmp_custom_genome_database.fasta ${TMP_PATH}/tmp_genome_DB --dbtype 2 >> mmseqs2_log_${NCBITAX} # Search the query proteins against the genome assemblies in a "tblastn" fashion mmseqs search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_genome_DB ${TMP_PATH}/tmp_aln ${TMP_PATH}/tmp_mmseqs --threads ${THREADS} -e ${EVALUE} --split-memory-limit 100G -s 7.5 >> mmseqs2_log_${NCBITAX} mmseqs convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_genome_DB ${TMP_PATH}/tmp_aln ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout --format-output "query,target,evalue,bits" >> mmseqs2_log_${NCBITAX} awk '{ print $2 }' ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout > ${TMP_PATH}/tmp_genome_sseqids awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_custom_genome_taxid_table ${TMP_PATH}/tmp_genome_sseqids > ${TMP_PATH}/tmp_genome_sseqids_taxids awk '{ print $2 }' ${TMP_PATH}/tmp_genome_sseqids_taxids | paste -d "\t" ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Diamond_results.bout rm ${TMP_PATH}/tmp_custom_genome_taxid_table ${TMP_PATH}/tmp_custom_genome_database.fasta ${TMP_PATH}/tmp_${NCBITAX}_DB* ${TMP_PATH}/tmp_genome_DB* ${TMP_PATH}/tmp_aln* ${TMP_PATH}/tmp_genome_sseqids_taxids ${TMP_PATH}/tmp_genome_sseqids ${TMP_PATH}/${NCBITAX}_custom_genome_DB.bout rm -r ${TMP_PATH}/tmp_mmseqs/ else echo "--------------------------------------------------" echo " ERROR: The user specified a custom nucleotide database, but the following file does not exist:" echo " ${CUSTOMGENOMES}" echo " genEra will continue without including the custom nucleotide database" fi fi DIAMONDOUT="${TMP_PATH}/${NCBITAX}_Diamond_results.bout" # Check that the output file of STEP 1 is not empty if [ -s ${DIAMONDOUT} ]; then echo "--------------------------------------------------" echo "Step 1 finished!" echo "The DIAMOND/MMseqs2 table can be found in ${DIAMONDOUT}" echo "This file is usually HUGE, please dispose of it if you no longer find it useful" echo "It can still be used (-p) in case the user wants to re-run genEra while skipping step 1" # Otherwise, display error and exit else echo echo " ERROR: The DIAMOND/MMseqs2 table is empty. Please check that DIAMOND and/or the target database were correctly installed" echo " Exiting" echo exit 1 fi elif [[ -d $QUERY_PDB ]]; then if [[ ! -f ${ALPHAFOLD_DB} ]]; then echo echo " ERROR: genEra could not find the locally installed database for Foldseek:" echo " ${NR_DB}" echo " Please refer to the manual for information on how to download such database" echo " Exiting" exit 1 fi echo echo "STARTING STEP 1: SEARCHING FOR HOMOLOGS WITHIN THE DATABASE USING FOLDSEEK" # Check whether Foldseek can be found by genEra FOLDSEEKEXEC=$(which foldseek) if [[ -z ${FOLDSEEKEXEC} ]]; then echo echo " ERROR: genEra could not find the foldseek excecutable. Unable to run step 1" echo " Please make sure that Foldseek is properly installed and located in the PATH" echo " Exiting" exit 1 fi echo "--------------------------------------------------" echo "Matching the query genes against themselves" # Generate Foldseek database for the query proteins foldseek createdb ${QUERY_PDB} ${TMP_PATH}/tmp_${NCBITAX}_DB -v 0 # Create a self-match table to find orphan genes and for the clustering analysis foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS} foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout --format-output "query,target,evalue,bits" -v 0 # Create input file for MCL cut -f 1,2,3 ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/tmp_${NCBITAX}.abc # Add the self-matches to the homology table awk -v NCBITAX="$NCBITAX" '{ print $0"\t"NCBITAX }' ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout > ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout # Remove temporary files rm ${TMP_PATH}/tmp_${NCBITAX}_aln* ${TMP_PATH}/tmp_${NCBITAX}_self_matches.bout echo "--------------------------------------------------" echo "Searching for homologs against the Foldseek database" # Search for hits between the query species and the Foldseek database (e.g., AlphaFold UniProt) foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${ALPHAFOLD_DB} ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS} # Generate table format and remove hits that do not have any associated taxids foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${ALPHAFOLD_DB} ${TMP_PATH}/tmp_${NCBITAX}_aln ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout --format-output "query,target,evalue,bits,taxid" -v 0 awk '$5!=""' ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout >> ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout # Verify that Foldseek ran correctly, otherwise abort and show error if [[ -s ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout ]]; then echo "Foldseek results written!" else echo " ERROR: Foldseek didn't run properly, verify that the database was built correctly" echo " Verify that Foldseek didn't run out of RAM" echo " Try increasing the amount of RAM or lower the value of prefilter hits (-M)" echo " Alternatively, use DIAMOND" echo " Exiting" exit 1 fi rm ${TMP_PATH}/${NCBITAX}_Foldseek_prefiltered_results.bout ${TMP_PATH}/tmp_${NCBITAX}_aln* # Look for homologs in a custom PDB dataset if [ -n "$CUSTOM_PDB" ]; then if [[ -f ${CUSTOM_PDB} ]]; then echo "--------------------------------------------------" echo "Custom PDB database found. Searching for homologs in the custom database using Foldseek" echo "This will require considerable storage space in ${TMP_PATH}" # Generate a list of all gene names with their respective taxid, and generate a custom PDB directory with all the custom structural predictions mkdir ${TMP_PATH}/tmp_custom_PDB_DB WORKINGDIR=$(pwd) IFS=$'\n' for SPECIES in $(cat ${CUSTOM_PDB}); do SP_PDB=$(echo $SPECIES | cut -d " " -f1) SP_TAXID=$(echo $SPECIES | cut -d " " -f2) cd ${SP_PDB} ls | sed "s/$/\t${SP_TAXID}/" >> ${TMP_PATH}/tmp_PDB_taxid_table cp * ${TMP_PATH}/tmp_custom_PDB_DB/ cd ${WORKINGDIR} done # Generate Foldseek custom database foldseek createdb ${TMP_PATH}/tmp_custom_PDB_DB ${TMP_PATH}/tmp_custom_foldseek -v 0 rm -r ${TMP_PATH}/tmp_custom_PDB_DB/ # Search for hits against the custom database foldseek search ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_custom_foldseek ${TMP_PATH}/tmp_custom_aln ${TMP_PATH}/tmp_${NCBITAX}_foldseek -e ${EVALUE} --max-seqs ${FOLDSEEK_MAXSEQS} --threads ${THREADS} -v 0 ${DIAMONDOPTS} foldseek convertalis ${TMP_PATH}/tmp_${NCBITAX}_DB ${TMP_PATH}/tmp_custom_foldseek ${TMP_PATH}/tmp_custom_aln ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout --format-output "query,target,evalue,bits" -v 0 # Add the taxids to the results of the custom database and merge them with the AlphaFoldDB results awk '{ print $2 }' ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout > ${TMP_PATH}/tmp_PDBids awk 'NR==FNR{a[$1]=$2;next}{print $0,a[$1]}' ${TMP_PATH}/tmp_PDB_taxid_table ${TMP_PATH}/tmp_PDBids > ${TMP_PATH}/tmp_PDBids_taxids awk '{ print $2 }' ${TMP_PATH}/tmp_PDBids_taxids | paste -d "\t" ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout - | awk '$5!=""' >> ${TMP_PATH}/${NCBITAX}_Foldseek_results.bout rm ${TMP_PATH}/tmp_PDB_taxid_table ${TMP_PATH}/tmp_custom_foldseek* ${TMP_PATH}/tmp_PDBids ${TMP_PATH}/tmp_PDBids_taxids ${TMP_PATH}/tmp_${NCBITAX}_custom_matches.bout ${TMP_PATH}/tmp_custom_aln* else # In case the file does not exist, skip this step echo "--------------------------------------------------" echo " ERROR: The user specified a custom PDB database, but the following file does not exist:" echo " ${CUSTOM_PDB}" echo " genEra will continue without including the custom PDB database" fi fi rm ${TMP_PATH}/tmp_${NCBITAX}_DB* rm -r ${TMP_PATH}/tmp_${NCBITAX}_foldseek/ DIAMONDOUT="${TMP_PATH}/${NCBITAX}_Foldseek_results.bout" # Check that the output file of STEP 1 is not empty if [ -s ${DIAMONDOUT} ]; then echo "--------------------------------------------------" echo "Step 1 finished!" echo "The Foldseek table can be found in ${DIAMONDOUT}" echo "This file is usually QUITE BIG, please dispose of it if you no longer find it useful" echo "It can still be used (-p) in case the user wants to re-run genEra while skipping step 1" # Otherwise, display error and exit else echo echo " ERROR: The Foldseek table is empty. Please check that foldseek and/or the target database were correctly installed" echo " Exiting" echo exit 1 fi fi ####################################################### ### GenEra: STEP 2 ### ####################################################### # Generate a taxonomic database based on the daxdump from the NCBI and the output of ncbitax2lin if [ -n "$CUSTOMTAX" ]; then if [ -s ${CUSTOMTAX} ]; then echo echo "THE SPECIES-TAILORED TAXONOMIC DATABASE WAS PROVIDED BY THE USER. SKIPPING STEP 2" else echo echo " ERROR: The species-tailored file provided by the user is empty" echo " Exiting" echo exit 1 fi else echo echo "STARTING STEP 2: GENERATING TAXONOMIC DATABASE FOR THE PHYLOSTRATIGRAPHIC ASSIGNMENT OF YOUR GENES" if [ -n "$RAWTAX" ]; then if [ -s ${RAWTAX} ]; then echo "--------------------------------------------------" echo "Using the raw \"ncbi_lineages\" file provided by the user. Skiping ncbitax2lin" else echo echo " ERROR: The raw \"ncbi_lineages\" file provided by the user is empty" echo " Exiting" echo exit 1 fi else # Check whether ncbitax2lin exists in the PATH TAX2LINPATH=$(which ncbitax2lin) if [[ -z ${TAX2LINPATH} ]]; then echo echo " ERROR: The user attempted to generate the raw \"ncbi_lineages\" file from the NCBI taxdump, but genEra could not find the ncbitax2lin excecutable. Unable to run step 2" echo " Please ensure that ncbitax2lin is correctly installed in your PATH and try again" echo " Or try to use an already existing \"ncbi_lineages\" file with argument -r" echo " The user can resume genEra from this step by including the DIAMOND/MMseqs2 file ${DIAMONDOUT} in the pipeline (-p)" exit 1 fi echo "--------------------------------------------------" echo "Running ncbitax2lin to generate a raw \"ncbi_lineages\" file from the NCBI taxdump" # Verify that the taxdump contains the necessary files to run ncbitax2lin if [[ ! -f ${TAXDUMP}/nodes.dmp ]] || [[ ! -f ${TAXDUMP}/names.dmp ]]; then echo echo " ERROR: genEra could not find \"nodes.dmp\" and/or \"names.dmp\" within the specified taxdump directory:" echo " ${TAXDUMP}/" echo " Please ensure that the route to the daxdump directory is correct, and that both files exist within that directory" echo " Or try to use an already existing \"ncbi_lineages\" file with argument -r" echo " The user can resume genEra from this step by including the DIAMOND/MMseqs2 file ${DIAMONDOUT} in the pipeline (-p)" exit 1 fi # Generate the raw ncbi_lineages file with ncbitax2lin ncbitax2lin --nodes-file ${TAXDUMP}/nodes.dmp --names-file ${TAXDUMP}/names.dmp gunzip ncbi_lineages_*.csv.gz mv ncbi_lineages_*.csv ${OUTDIR}/ RAWTAX=$(ls ${OUTDIR}/ncbi_lineages_*.csv) echo "--------------------------------------------------" echo "Finished generating a raw \"ncbi_lineages\" file named ${RAWTAX}" echo "Keep it in case you want to run genEra with another species (-r)" fi # Verify that the raw ncbi_lineages file exists if [[ ! -f ${RAWTAX} ]]; then echo echo " ERROR: genEra could not find the raw \"ncbi_lineages\" file:" echo " ${RAWTAX}" echo " If step 1 ran succesfully, the user can resume from this step using the argument -p" echo " Exiting" exit 1 fi # Verify that the raw ncbi_lineages file is not empty if [ -s ${RAWTAX} ]; then echo "--------------------------------------------------" echo "Rearranging the raw \"ncbi_lineages\" file by taxonomic hierarchy" else echo echo " ERROR: The raw \"ncbi_lineages\" file is empty" echo " Check that ncbitax2lin was correctly installed. Otherwise, you can download a pre-generated raw \"ncbi_lineages\" file from GenEra's github and use ir with the argument -r" echo " Exiting" echo exit 1 fi # Extract the line in the "ncbi_lineages" file corresponding to the query organism QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${RAWTAX}) # Download the lineage hierarchy of the query species from the NCBI webpage (in case fullnamelineage.dmp is not found) if [[ ! -f ${TAXDUMP}/fullnamelineage.dmp ]]; then echo " WARNING: genEra could not find the file fullnamelineage.dmp within the NCBI taxdump" echo " Please make sure you download the latest version of the NCBI Taxonomy dump: ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" wget -q "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=${NCBITAX}&lvl=3&p=has_linkout&p=blast_url&p=genome_blast&lin=f&keep=1&srchmode=1&unlock" fi if [[ -f ${TAXDUMP}/fullnamelineage.dmp ]]; then echo "Extracting lineage information from ${TAXDUMP}/fullnamelineage.dmp" LINEAGE_ARRAY=() IFS=$'\n' for LINEAGE in $(awk -F "|" -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $3 }' ${TAXDUMP}/fullnamelineage.dmp | sed 's/\t//g' | sed 's/; /\n/g' | sed '/^[[:space:]]*$/d' | grep -v "incertae sedis"); do LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$LINEAGE" | cut -d: -f1 && echo)) done TOP_LEVEL=$(awk -F "|" -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $2 }' ${TAXDUMP}/fullnamelineage.dmp | sed 's/\t//g') LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$TOP_LEVEL" | cut -d: -f1 && echo)) # Extract the columns in "ncbi_lineages" in the correct hierarchical order touch ${TMP_PATH}/tmp_column_0 PRINTCOLUMN="${TMP_PATH}/tmp_column_0" for LINEAGECOL in "${LINEAGE_ARRAY[@]}"; do awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," -v LINEAGECOL="$LINEAGECOL" '{ print $LINEAGECOL }' | paste -d , - ${PRINTCOLUMN} > ${TMP_PATH}/tmp_column_${LINEAGECOL} PRINTCOLUMN="${TMP_PATH}/tmp_column_${LINEAGECOL}" done awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1 }' | paste -d , - ${PRINTCOLUMN} | sed 's/,$//' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages rm ${TMP_PATH}/tmp_column_* # Extract the philostrata corresponding to the query organism QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages) # Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms") if [[ -f $QUERY_FASTA ]]; then GENENUM=$(grep -c ">" ${QUERY_FASTA}) elif [[ -d $QUERY_PDB ]]; then GENENUM=$(ls ${QUERY_PDB} | wc -l) fi OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }') echo "--------------------------------------------------" echo "Extracting all the lineages that match more than ${MINHITS} percent of your query proteins" HITPERC=$(( ${GENENUM}*${MINHITS}/100 )) awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches # Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis COUNTER=2 echo "--------------------------------------------------" echo "Collapsing the phylostrata that are not represented in your DIAMOND results" cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY=() COLLAPSEDSTRATA=() IFS=$'\n' for STRATUM in $(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2); do MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l) if [ "$MATCHINGTAXA" -ge 1 ]; then awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY+=$COUNTER, else COLLAPSEDSTRATA+="[${STRATUM}] " fi let COUNTER+=1 done if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then echo "--------------------------------------------------" echo " WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data:" echo " $COLLAPSEDSTRATA" echo " If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database" fi echo "--------------------------------------------------" echo "Generating the species-tailored database" awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv" for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL} PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}" done cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_* CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv" elif [[ -f wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock ]]; then echo "Extracting lineage information from the NCBI webpage" # Create an array which contains the column numbers corresponding to the phylostrata of the query species LINEAGE_ARRAY=() IFS=$'\n' for LINEAGE in $(grep "ALT=\"no rank" wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock | sed 's/>/\n/g' | grep "Taxonomy browser" wwwtax.cgi\?mode\=Info\&id\=${NCBITAX}\&lvl\=3\&p\=has_linkout\&p\=blast_url\&p\=genome_blast\&lin\=f\&keep\=1\&srchmode\=1\&unlock | sed 's/ Taxonomy browser (//g' | sed 's/)<\/title>//g') LINEAGE_ARRAY+=($(echo ${QPHYLOS} | sed -n $'1s/,/\\\n/gp' | grep -nx "$TOP_LEVEL" | cut -d: -f1 && echo)) # Extract the columns in "ncbi_lineages" in the correct hierarchical order touch ${TMP_PATH}/tmp_column_0 PRINTCOLUMN="${TMP_PATH}/tmp_column_0" for LINEAGECOL in "${LINEAGE_ARRAY[@]}"; do awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," -v LINEAGECOL="$LINEAGECOL" '{ print $LINEAGECOL }' | paste -d , - ${PRINTCOLUMN} > ${TMP_PATH}/tmp_column_${LINEAGECOL} PRINTCOLUMN="${TMP_PATH}/tmp_column_${LINEAGECOL}" done awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1 }' | paste -d , - ${PRINTCOLUMN} | sed 's/,$//' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages rm wwwtax* ${TMP_PATH}/tmp_column_* # Extract the philostrata corresponding to the query organism QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages) # Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms") if [[ -f $QUERY_FASTA ]]; then GENENUM=$(grep -c ">" ${QUERY_FASTA}) elif [[ -d $QUERY_PDB ]]; then GENENUM=$(ls ${QUERY_PDB} | wc -l) fi OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }') echo "--------------------------------------------------" echo "Extracting all the lineages that match more than ${MINHITS} percent of your query proteins" HITPERC=$(( ${GENENUM}*${MINHITS}/100 )) awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches # Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis COUNTER=2 echo "--------------------------------------------------" echo "Collapsing the phylostrata that are not represented in your DIAMOND results" cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY=() COLLAPSEDSTRATA=() IFS=$'\n' for STRATUM in $(echo $QPHYLOS | sed "s/,/\n/g" | tail -n +2); do MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l) if [ "$MATCHINGTAXA" -ge 1 ]; then awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY+=$COUNTER, else COLLAPSEDSTRATA+="[${STRATUM}] " fi let COUNTER+=1 done if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then echo "--------------------------------------------------" echo " WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data:" echo " $COLLAPSEDSTRATA" echo " If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database" fi echo "--------------------------------------------------" echo "Generating the species-tailored database" awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv" for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL} PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}" done cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_* CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv" else echo "--------------------------------------------------" echo " ERROR: genEra was unable to retrieve the lineage information of your query species from the local NCBI taxdump or from the NCBI webpage" echo " genEra will attempt to organize the phylostrata in the correct taxonomic hierarchy" echo " This may result in the loss of some phylostrata from the species-tailored database" # Eliminate the inner commas from some of the taxa in the raw file and arrange the lineages by taxonomic hierarchies awk -F'"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(",", "", $i) } 1' ${RAWTAX} | awk -F "," '{ print $1","$8","$52","$51","$57","$7","$63","$68","$56","$6","$65","$45","$36","$59","$5","$66","$55","$30","$35","$54","$4","$64","$60","$3","$67","$58","$38","$2","$40 }' > ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages # Extract the philostrata corresponding to the query organism QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages) # Extract all the lineages proteins that matched the query sequences under a certain threshold and that belong to the same oldest lineage of the query species (e.g., "cellular organisms") if [[ -f $QUERY_FASTA ]]; then GENENUM=$(grep -c ">" ${QUERY_FASTA}) elif [[ -d $QUERY_PDB ]]; then GENENUM=$(ls ${QUERY_PDB} | wc -l) fi OLDEST=$(echo $QPHYLOS | awk -F "," '{ print $NF }') HITPERC=$(( ${GENENUM}*${MINHITS}/100 )) awk '{ print $NF }' ${DIAMONDOUT} | sort -n -T ${TMP_PATH} | uniq -c | awk -v HITPERC="$HITPERC" '{ if ($1 >= HITPERC) print $2 }' | sed '/^[[:space:]]*$/d' | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | grep "$OLDEST" > ${TMP_PATH}/tmp_${NCBITAX}_matches #Evaluate the existance of unassignable clades in the database MISSINGSTRATA=() awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $10","$11","$12","$13","$14","$15","$16","$17","$18","$19","$20","$21","$22","$23","$24","$25","$26","$27","$28","$29","$41","$42","$43","$44 }' ${RAWTAX} > ${TMP_PATH}/tmp_missingtaxa_${NCBITAX} IFS=$'\n' for MISSING in $(sed "s/,/\n/g" ${TMP_PATH}/tmp_missingtaxa_${NCBITAX}); do if [ -n "$MISSING" ]; then MISSINGSTRATA+="[${MISSING}] " fi done if [ ${#MISSINGSTRATA[@]} -gt 0 ]; then echo "--------------------------------------------------" echo " WARNING: genEra was unable to automatically include the following phylostrata in the analysis, as the NCBI database did not assigned them to a concrete taxonomic level:" echo " $MISSINGSTRATA" echo " If you want to include them in your analysis, make sure that genEra runs in a machine with internet access" echo " Alternatively, the user can generate a custom table (-c) directly from the raw \"ncbi_lineages\" file, using the following command:" echo " awk -F'\"' -v OFS='' '{ for (i=2; i<=NF; i+=2) gsub(\",\", \"\", \$i) } 1' ncbi_lineages_[date].csv | awk -F \",\" '{ print \$1\",\"\$8\",\"\$7\",\"\$6 ... }' > custom_table.csv" fi # Finally, tailor the taxonomic database for the query species with the phylostrata that are useful for the analysis COUNTER=2 echo "--------------------------------------------------" echo "Collapsing the phylostrata that are not represented in your DIAMOND results" cat ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY=() COLLAPSEDSTRATA=() IFS=$'\n' for STRATUM in $(echo $QPHYLOS | sed 's/,,/,NA,/g' | sed 's/,,/,NA,/g' | sed "s/,/\n/g" | tail -n +2); do if [ $STRATUM != "NA" ]; then MATCHINGTAXA=$(awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER == STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches | wc -l) if [ "$MATCHINGTAXA" -ge 1 ]; then awk -F "," -v COUNTER="$COUNTER" -v STRATUM="$STRATUM" '{ if ($COUNTER != STRATUM) print $0 }' ${TMP_PATH}/tmp_${NCBITAX}_matches > ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ARRAY+=$COUNTER, else COLLAPSEDSTRATA+="[${STRATUM}] " fi fi let COUNTER+=1 done if [ ${#COLLAPSEDSTRATA[@]} -gt 0 ]; then echo "--------------------------------------------------" echo " WARNING: The following phylostrata were collapsed due to lack of sufficient genomic data" echo " $COLLAPSEDSTRATA" echo " If you want to include them in your analysis, please add the necessary taxa in a custom database (-a or -f), making sure their last common ancestor to the query species can be assigned to that specific taxonomic level in the NCBI taxonomy database" fi echo "--------------------------------------------------" echo "Generating the species-tailored database" awk -F "," '{ print $1 }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages > ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv PRINTCOLUMN="${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv" for LEVEL in $(echo $ARRAY | sed "s/,/\n/g" | sed '/^[[:space:]]*$/d'); do awk -F "," -v LEVEL="$LEVEL" '{ print $LEVEL }' ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages | paste -d , ${PRINTCOLUMN} - > ${TMP_PATH}/tmp_column_${LEVEL} PRINTCOLUMN="${TMP_PATH}/tmp_column_${LEVEL}" done cat ${PRINTCOLUMN} > ${OUTDIR}/${NCBITAX}_ncbi_lineages.csv rm ${TMP_PATH}/tmp_arranged_${NCBITAX}_ncbi_lineages ${TMP_PATH}/tmp_${NCBITAX}_ncbi_lineages.csv ${TMP_PATH}/tmp_${NCBITAX}_trimmed_matches ${TMP_PATH}/tmp_${NCBITAX}_matches ${TMP_PATH}/tmp_column_* ${TMP_PATH}/tmp_missingtaxa_${NCBITAX} CUSTOMTAX="${OUTDIR}/${NCBITAX}_ncbi_lineages.csv" fi # Add the infraspecies-level nodes to the taxonomic database if [[ -f ${STRAINS} ]]; then echo "--------------------------------------------------" echo "Adding the infraspecies nodes to the species-tailored database" # Extract the taxonomic levels of the query species from the NCBI Taxonomy, minus the NCBI Taxonomy ID QPHYLOS_SUFFIX=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX} | awk -F "," 'BEGIN{OFS=","} {$1="";print}') # Calculate the number of columns in the strains table to generate a string of N commas NUM_COLUMNS=$(awk -F',' 'NR==1{print NF}' ${TMP_PATH}/${TARGET_GENOME}.csv) COMMAS=$(printf ',%.0s' $(seq 1 $NUM_COLUMNS)) # Generate a string with the original lineage line for que query species for removal QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX}) # Remove the line containing the lineage info for the query species and add N commas between column 1 and column 2 of the NCBI taxonomy table grep -v "$QPHYLOS" ${CUSTOMTAX} | sed "s/\([^,]*\),\([^,]*\)/\1$COMMAS\2/" > ${TMP_PATH}/tmp_${NCBITAX}_added_columns # Add the missing taxonomic levels to the infraspecies-level genomes and incorporate these to the NCBI taxonomy table awk -F "," -v QPHYLOS_SUFFIX="$QPHYLOS_SUFFIX" '{ print $0 QPHYLOS_SUFFIX }' ${TMP_PATH}/${TARGET_GENOME}.csv >> ${TMP_PATH}/tmp_${NCBITAX}_added_columns # Replace the species-tailored taxonomy table with the additional infraspecies-level nodes mv ${TMP_PATH}/tmp_${NCBITAX}_added_columns ${CUSTOMTAX} # Remove the "${TARGET_GENOME}.csv" file from tmp rm ${TMP_PATH}/${TARGET_GENOME}.csv fi if [ -s ${CUSTOMTAX} ]; then echo "--------------------------------------------------" echo "Step 2 finished!" echo "your species-tailored database can be found in ${CUSTOMTAX}" echo "Keep this file in case you want to re-run step 3 of genEra with the same species (-c)" else echo echo " ERROR: The species-tailored database is empty! please send me an email to figure out what might be the issue (josue.barrera@tuebingen.mpg.de)" echo " Exiting" echo exit 1 fi fi if [[ ! -f ${CUSTOMTAX} ]]; then echo echo " ERROR: genEra could not find the species-tailored \"ncbi_lineages\" file:" echo " ${CUSTOMTAX}" echo " Please make sure that this file exists, otherwise run step 2 of genEra (-d or -r)" echo " If step 1 ran succesfully, the user can resume from this step using argument -p" echo " Exiting" exit 1 fi ####################################################### ### GenEra: STEP 3 ### ####################################################### # Start the gene age assignment echo echo "STARTING STEP 3: ASSIGNING AGES TO YOUR QUERY GENES WITH Erassignment" # Generate temporal list with all query proteins from step 1 if [[ -f $QUERY_FASTA ]]; then grep ">" ${QUERY_FASTA} | awk '{ print $1 }' | sed 's/>//g' > ${TMP_PATH}/tmp_gene_list elif [[ -d $QUERY_PDB ]]; then ls ${QUERY_PDB} > ${TMP_PATH}/tmp_gene_list fi if [[ -f ${DIVERGENCE} ]]; then # Calculate the size of the genes in the query FASTA or PDB files if [[ -f $QUERY_FASTA ]]; then echo -e "GeneID\tGeneLength" > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths awk '/^>/{if (l!="") print l; print; l=0; next}{l+=length($0)}END{print l}' ${QUERY_FASTA} | awk '{ print $1 }' | sed '$!N;s/\n/\t/g' | sed 's/>//g' >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths elif [[ -d $QUERY_PDB ]]; then echo -e "GeneID\tGeneLength" > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths for PDBFILE in $(ls ${QUERY_PDB}); do PROT_SIZE=$(awk '{ if ($1 == "TER") print $4$5 }' ${QUERY_PDB}/${PDBFILE} | sed 's/A//g') echo -e "${PDBFILE}\t${PROT_SIZE}" >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths done fi # Create header of the bitscore file for abSENSE awk '{ printf( "%s ", $1 ); } END { printf( "\n" ); }' ${DIVERGENCE} | sed 's/[[:space:]]*$//' | sed 's/ /\t/g' | sed -e 's/^/None\t/' > ${OUTDIR}/${NCBITAX}_bitscores.tsv fi if [ $THREADS -gt 1 ]; then if [[ "${FAST_STEP3}" = "true" ]]; then # Split ${DIAMONDOUT} into N files with 848068583 lines due to R number of lines limit | This can be further tested for improving RAM usage split -l 848068583 ${DIAMONDOUT} -d "${TMP_PATH}/Diamond_F3R_" # Split ${DIAMONDOUT} results per gene and store in ${TMP_PATH} echo "--------------------------------------------------" echo "Splitting results per query gene using ${THREADS} threads" Rscript --vanilla ${F3R} -t ${TMP_PATH} -c ${THREADS} rm ${TMP_PATH}/Diamond_F3R_* fi # Split the gene list in accordance with the number of threads SPLIT_NUM=$(wc -l ${TMP_PATH}/tmp_gene_list | cut -d' ' -f1 | while read number; do echo $((number / $THREADS +1)); done) split -l $SPLIT_NUM ${TMP_PATH}/tmp_gene_list ${TMP_PATH}/tmp_gene_sublist_ # Create modified copies of Erassignment, according to the number of threads MULTITHREAD_COUNTER=1 for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_gene_sublist_"); do sed "s/tmp_gene_list/$SUBLIST/g" ${PHYLOSCRIPT} | sed "s/tmp_phylostrata/tmp_phylostrata_$MULTITHREAD_COUNTER/g" | sed "s/tmp_sub_phylostrata/tmp_sub_phylostrata_$MULTITHREAD_COUNTER/g" > ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER && chmod +x ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER let MULTITHREAD_COUNTER+=1 done # Print the header of the output files echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi # Run the multithreaded phylostratigraphic assignment echo "--------------------------------------------------" echo "Running Erassignment using ${THREADS} threads" for SUBSCRIPT in $(ls ${TMP_PATH}/Erassignment_*); do ${SUBSCRIPT} ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${FAST_STEP3} ${DIVERGENCE} & done wait rm ${TMP_PATH}/Erassignment_* sort ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${OUTDIR}/tmp_assignment && mv ${OUTDIR}/tmp_assignment ${OUTDIR}/${NCBITAX}_gene_ages.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then sort ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv > ${OUTDIR}/tmp_deepest_homolog && mv ${OUTDIR}/tmp_deepest_homolog ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi # Otherwise run as single-threaded job (not recommended) else echo "--------------------------------------------------" echo "Running Erassignment using a single thread" echo "This WILL take a while" echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi if [[ "${FAST_STEP3}" = "true" ]]; then # If FAST_STEP3 is setted as true with single thread turn to false FAST_STEP3='false' echo "-F (FAST_STEP3) implementation is only available for runs with multiple threads specified. Setting -F to false." fi Erassignment ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${FAST_STEP3} ${DIVERGENCE} fi if [[ "${JACKHMMER}" = "true" && -f $QUERY_FASTA ]]; then echo "--------------------------------------------------" echo "The user requested an additional jackhmmer step on top of DIAMOND" echo "Re-analyzing all genes with a gene age rank of ${HMMERTEST} or higher using hmmEra" echo "Be aware that this extra step requires internet connection and is VERY SLOW" hmmEra -f ${QUERY_FASTA} -a ${OUTDIR}/${NCBITAX}_gene_ages.tsv -e ${EVALUE} -s ${HMMERTEST} -o ${TMP_PATH}/${NCBITAX}_HMMER_results.bout cat ${TMP_PATH}/${NCBITAX}_HMMER_results.bout >> ${DIAMONDOUT} echo "--------------------------------------------------" echo "Re-calculating gene ages after running hmmEra" if [ $THREADS -gt 1 ]; then SPLIT_NUM=$(wc -l ${TMP_PATH}/tmp_gene_list | cut -d' ' -f1 | while read number; do echo $((number / $THREADS +1)); done) split -l $SPLIT_NUM ${TMP_PATH}/tmp_gene_list ${TMP_PATH}/tmp_gene_sublist_ MULTITHREAD_COUNTER=1 for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_gene_sublist_"); do sed "s/tmp_gene_list/$SUBLIST/g" ${PHYLOSCRIPT} | sed "s/tmp_phylostrata/tmp_phylostrata_$MULTITHREAD_COUNTER/g" | sed "s/tmp_sub_phylostrata/tmp_sub_phylostrata_$MULTITHREAD_COUNTER/g" > ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER && chmod +x ${TMP_PATH}/Erassignment_$MULTITHREAD_COUNTER let MULTITHREAD_COUNTER+=1 done echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi echo "Running Erassignment using ${THREADS} threads" for SUBSCRIPT in $(ls ${TMP_PATH}/Erassignment_*); do ${SUBSCRIPT} ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${DIVERGENCE} & done wait rm ${TMP_PATH}/Erassignment_* sort ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${OUTDIR}/tmp_assignment && mv ${OUTDIR}/tmp_assignment ${OUTDIR}/${NCBITAX}_gene_ages.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then sort ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv > ${OUTDIR}/tmp_deepest_homolog && mv ${OUTDIR}/tmp_deepest_homolog ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi else echo "Running Erassignment using a single thread" echo "This WILL take a while" echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_gene_ages.tsv echo -e "#gene\tpossible_phylostrata" > ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv if [[ "${PRINTOLDEST}" = "true" ]]; then echo -e "#qseqid\tsseqid\tevalue\tbitscore\tstaxids" > ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv fi Erassignment ${DIAMONDOUT} ${CUSTOMTAX} ${NCBITAX} ${THRESHOLD} ${TMP_PATH} ${PRINTOLDEST} ${OUTDIR} ${DIVERGENCE} fi elif [[ "${JACKHMMER}" = "true" && -d $QUERY_PDB ]]; then echo "--------------------------------------------------" echo "The user requested an additional jackhmmer step on top of Foldseek" echo "Unfortunately, jackhmmer does not accept PDB files" echo "Therefore, this step will be skipped" fi # Calculate the number of genes per phylostratum echo -e "#number_of_genes\tphylostratum\tphylorank" > ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX}) PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l) IFS=$'\n' for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2); do NUMGENES=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_gene_ages.tsv | wc -l) echo -e "${NUMGENES}\t${STRATUM}\t${PHYLORANK}" >> ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv let PHYLORANK-=1 done rm ${TMP_PATH}/tmp_gene_* if [[ -f ${TMP_PATH}/tmp_${NCBITAX}_sequences ]]; then rm ${TMP_PATH}/tmp_${NCBITAX}_sequences fi echo "--------------------------------------------------" echo "Running mcl to define gene families" # Run MCL to predict gene families from the query self-match in step 1 mcxload -abc ${TMP_PATH}/tmp_${NCBITAX}.abc --stream-mirror --stream-neg-log10 -stream-tf 'ceil(200)' -o ${TMP_PATH}/tmp_${NCBITAX}.mci -write-tab ${TMP_PATH}/tmp_${NCBITAX}.tab mcl ${TMP_PATH}/tmp_${NCBITAX}.mci -I 1.5 -te ${THREADS} -o ${TMP_PATH}/tmp_${NCBITAX}.mcl -V all mcxdump -icl ${TMP_PATH}/tmp_${NCBITAX}.mcl -tabr ${TMP_PATH}/tmp_${NCBITAX}.tab -o ${TMP_PATH}/tmp_${NCBITAX}.dump # Eliminate all the temporary files rm ${TMP_PATH}/tmp_${NCBITAX}.abc ${TMP_PATH}/tmp_${NCBITAX}.mci ${TMP_PATH}/tmp_${NCBITAX}.tab ${TMP_PATH}/tmp_${NCBITAX}.mcl echo "--------------------------------------------------" echo "Establishing the age and number of gene-family founder events" # Generate a table with the oldest assignable phylostratum per gene family and gene-family size echo -e "#gene_family\tphylostratum\trank\tfamily_size" > ${OUTDIR}/${NCBITAX}_founder_events.tsv IFS=$'\n' for GENECLUSTER in $(cat ${TMP_PATH}/tmp_${NCBITAX}.dump); do GENEFAMILY=$(echo ${GENECLUSTER} | sed 's/\t/,/g') FAMILYSIZE=$(echo ${GENECLUSTER} | sed 's/\t/\n/g' | wc -l) FOUNDER=$(echo ${GENECLUSTER} | sed 's/\t/\n/g' | awk 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${OUTDIR}/${NCBITAX}_gene_ages.tsv | sort -T ${TMP_PATH} -t$'\t' -nk3 | grep -v "Possible contamination or HGT" | head -1 | cut -d' ' -f2,3) if [[ -z ${FOUNDER} ]]; then echo -e "$GENEFAMILY\tPossible contamination or HGT\tNA\t$FAMILYSIZE" >> ${OUTDIR}/${NCBITAX}_founder_events.tsv else echo -e "$GENEFAMILY\t$FOUNDER\t$FAMILYSIZE" >> ${OUTDIR}/${NCBITAX}_founder_events.tsv fi done rm ${TMP_PATH}/tmp_${NCBITAX}.dump # Summarize the number of gene founder events per phylostratum echo -e "#number_of_founders\tphylostratum\tphylorank" > ${OUTDIR}/${NCBITAX}_founder_summary.tsv QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX}) PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l) IFS=$'\n' for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2); do NUMFOUNDERS=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_founder_events.tsv | wc -l) echo -e "${NUMFOUNDERS}\t${STRATUM}\t${PHYLORANK}" >> ${OUTDIR}/${NCBITAX}_founder_summary.tsv let PHYLORANK-=1 done echo "--------------------------------------------------" echo "Step 3 finished!" echo "The age assignment for your individual genes can be found in ${OUTDIR}/${NCBITAX}_gene_ages.tsv" if [[ "${PRINTOLDEST}" = "true" ]]; then echo "The deepest traceable homologs for each of your genes can be found in ${OUTDIR}/${NCBITAX}_deepest_homolog.tsv" fi echo "The possible ages for the genes with a taxonomic representativeness below ${THRESHOLD} percent can be found in ${OUTDIR}/${NCBITAX}_ambiguous_phylostrata.tsv" echo "The estimation of gene family founder events can be found in ${OUTDIR}/${NCBITAX}_founder_events.tsv" echo "The number of individual genes that could be assigned to each phylostratum are summarized in ${OUTDIR}/${NCBITAX}_gene_age_summary.tsv" echo "The number of of gene family founder events per phylostratum are summarized in ${OUTDIR}/${NCBITAX}_founder_summary.tsv" ####################################################### ### GenEra: STEP 4 ### ####################################################### # Run abSENSE, if the user invokes it with -s if [[ -f ${DIVERGENCE} ]]; then # Check that the abSENSE input file is not empty if [ -s ${DIVERGENCE} ]; then echo echo "STARTING STEP 4: CALCULATING DETECTION FAILURE PROBABILITIES WITH abSENSE.py" else echo echo " ERROR: The file with the pairwise evolutionary distances is empty" echo " Exiting" echo exit 1 fi # Split the input files in accordance with the number of threads if [ "${THREADS}" -gt 1 ]; then head -1 ${OUTDIR}/${NCBITAX}_bitscores.tsv > ${TMP_PATH}/bitscores_header head -1 ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths > ${TMP_PATH}/gene_lengths_header tail -n +2 ${OUTDIR}/${NCBITAX}_bitscores.tsv | sort > ${TMP_PATH}/tmp_${NCBITAX}_bitscores tail -n +2 ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths | sort > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header split -l $SPLIT_NUM ${TMP_PATH}/tmp_${NCBITAX}_bitscores ${TMP_PATH}/tmp_${NCBITAX}_bitscores_ split -l $SPLIT_NUM ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_ # Rename the input files according to the number of threads MULTITHREAD_COUNTER=1 for SUBLIST in $(ls ${TMP_PATH} | grep "tmp_${NCBITAX}_bitscores_"); do SUFIXTMP=$(echo ${SUBLIST} | cut -d "_" -f4) cat ${TMP_PATH}/bitscores_header > ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv && cat ${TMP_PATH}/tmp_${NCBITAX}_bitscores_${SUFIXTMP} >> ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv cat ${TMP_PATH}/gene_lengths_header > ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} && cat ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_${SUFIXTMP} >> ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} let MULTITHREAD_COUNTER+=1 done # Eliminate all temporary files generated earlier rm ${TMP_PATH}/bitscores_header ${TMP_PATH}/gene_lengths_header ${TMP_PATH}/tmp_${NCBITAX}_bitscores ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header ${TMP_PATH}/tmp_${NCBITAX}_bitscores_* ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_no_header_* # Run abSENSE using multiple threads echo "--------------------------------------------------" echo "Running abSENSE using ${THREADS} threads" MULTITHREAD_COUNTER=1 while [[ ${MULTITHREAD_COUNTER} -le ${THREADS} ]]; do Run_abSENSE.py --distfile ${DIVERGENCE} --scorefile ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv --Eval ${EVALUE} --genelenfile ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} --out ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results >> ${OUTDIR}/log_${NCBITAX}_abSENSE & let MULTITHREAD_COUNTER+=1 done wait # Merge all the abSENSE subfiles in a single directory mkdir ${OUTDIR}/${NCBITAX}_abSENSE_results head -2 ${TMP_PATH}/1_abSENSE_results/Bitscore_99PI_lowerbound_predictions > ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_lowerbound_predictions head -2 ${TMP_PATH}/1_abSENSE_results/Bitscore_99PI_higherbound_predictions > ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_higherbound_predictions head -2 ${TMP_PATH}/1_abSENSE_results/Detection_failure_probabilities > ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities head -2 ${TMP_PATH}/1_abSENSE_results/Parameter_values > ${OUTDIR}/${NCBITAX}_abSENSE_results/Parameter_values head -2 ${TMP_PATH}/1_abSENSE_results/Predicted_bitscores > ${OUTDIR}/${NCBITAX}_abSENSE_results/Predicted_bitscores MULTITHREAD_COUNTER=1 while [[ ${MULTITHREAD_COUNTER} -le ${THREADS} ]]; do tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Bitscore_99PI_lowerbound_predictions >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_lowerbound_predictions tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Bitscore_99PI_higherbound_predictions >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Bitscore_99PI_higherbound_predictions tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Detection_failure_probabilities >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Parameter_values >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Parameter_values tail -n +3 ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/Predicted_bitscores >> ${OUTDIR}/${NCBITAX}_abSENSE_results/Predicted_bitscores rm ${TMP_PATH}/${NCBITAX}_bitscores_${MULTITHREAD_COUNTER}.tsv ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths_${MULTITHREAD_COUNTER} ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results/* && rmdir ${TMP_PATH}/${MULTITHREAD_COUNTER}_abSENSE_results let MULTITHREAD_COUNTER+=1 done mv ${OUTDIR}/log_${NCBITAX}_abSENSE ${OUTDIR}/${NCBITAX}_abSENSE_results/ # Otherwise run as single-threaded job (not recommended) else echo "--------------------------------------------------" echo "Running abSENSE using a single thread" echo "This WILL take another while" Run_abSENSE.py --distfile ${DIVERGENCE} --scorefile ${OUTDIR}/${NCBITAX}_bitscores.tsv --Eval ${EVALUE} --genelenfile ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths --out ${OUTDIR}/${NCBITAX}_abSENSE_results > ${OUTDIR}/log_${NCBITAX}_abSENSE mv ${OUTDIR}/log_${NCBITAX}_abSENSE ${OUTDIR}/${NCBITAX}_abSENSE_results/ fi echo "--------------------------------------------------" echo "Creating a list of high-confidence gene founder events (homology detection failure probabilities < 0.05 in the closest outgroup)" mv ${TMP_PATH}/tmp_${NCBITAX}_gene_lengths ${OUTDIR}/${NCBITAX}_abSENSE_results/${NCBITAX}_gene_lengths.tsv # Print the headers for the output file of the high-confidence set of genes based on homology detection failure echo -e "#gene\tphylostratum\trank\ttaxonomic_representativeness" > ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv echo -e "#gene_family\tphylostratum\trank\tfamily_size" > ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv # Extract the list of phylostrata in the query species QPHYLOS=$(awk -F "," -v NCBITAX="$NCBITAX" '{ if ($1 == NCBITAX) print $0 }' ${CUSTOMTAX}) PHYLORANK=$(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +2 | wc -l) # Print headers for the summary files echo -e "#number_of_genes\tphylostratum\tphylorank\toutgroup" > ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv echo ${QPHYLOS} | awk -F "," -v PHYLORANK="$PHYLORANK" '{ print "NA\t"$2"\t"PHYLORANK"\tNA" }' >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv echo -e "#number_of_founders\tphylostratum\tphylorank\toutgroup" > ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv echo ${QPHYLOS} | awk -F "," -v PHYLORANK="$PHYLORANK" '{ print "NA\t"$2"\t"PHYLORANK"\tNA" }' >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv # Create a list with the taxonomy of all the taxa that were included in the list with evolutionary distances awk '{ print $1 }' ${DIVERGENCE} | awk -F "," 'FNR==NR{ a[$1]=$0;next } ($1 in a)' - ${CUSTOMTAX} | grep "$OLDEST" > ${TMP_PATH}/tmp_taxa_distances # Create a counter for the homology detection failure test COUNTER_HDF=3 let PHYLORANK-=1 IFS=$'\n' for STRATUM in $(echo ${QPHYLOS} | sed "s/,/\n/g" | tail -n +3); do # Extract the position and name of the outgroup phylostratum in the custom NCBI_lineages file COUNTER_OUTGROUP=$((${COUNTER_HDF} + 1)) OUTGROUP_STRATUM=$(echo ${QPHYLOS} | awk -F "," -v COUNTER_OUTGROUP="$COUNTER_OUTGROUP" '{ print $COUNTER_OUTGROUP }') # Extract the list of outgroup species that can be used to test each of the phylostrata in the analysis awk -F "," -v COUNTER_HDF="$COUNTER_HDF" -v STRATUM="$STRATUM" -v COUNTER_OUTGROUP="$COUNTER_OUTGROUP" -v OUTGROUP_STRATUM="$OUTGROUP_STRATUM" '{ if ($COUNTER_HDF != STRATUM && $COUNTER_OUTGROUP == OUTGROUP_STRATUM) print $1 }' ${TMP_PATH}/tmp_taxa_distances > ${TMP_PATH}/tmp_outgroups # Obtain the outgroup for each phylostratum with the lowest evolutionary distance to the target species HDF_OUTGROUP=$(grep -w -f ${TMP_PATH}/tmp_outgroups ${DIVERGENCE} | sort -T ${TMP_PATH} -t$'\t' -nk2 | head -1 | cut -d " " -f1) # Obtain the column in the bitscore file that corresponds to the outgroup of each phylostratum, so it can be used for the homology detection failure test OUTGROUP_COLUMN=$(head -1 ${OUTDIR}/${NCBITAX}_bitscores.tsv | sed -n $'1s/\t/\\\n/gp' | grep -w -nx "$HDF_OUTGROUP" | cut -d: -f1) # Print all the gene founder events that passed the homology detection failure test awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $1 }' ${OUTDIR}/${NCBITAX}_gene_ages.tsv > ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt grep -v "#" ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities | awk -F "\t" -v OUTGROUP_COLUMN="$OUTGROUP_COLUMN" '{ if ($OUTGROUP_COLUMN <= 0.05) print $1 }' | grep -w -f ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt > ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} && grep -f ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} ${OUTDIR}/${NCBITAX}_gene_ages.tsv >> ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_founder_events.tsv | grep -f ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} >> ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv rm ${TMP_PATH}/tmp_level_${COUNTER_HDF}_genes.txt ${TMP_PATH}/tmp_HDF_${COUNTER_HDF} ${TMP_PATH}/tmp_outgroups # Create the summary files for the gene age assignment and the gene-family founder events if [[ -z ${HDF_OUTGROUP} ]]; then echo -e "NA\t${STRATUM}\t${PHYLORANK}\tNO OUTGROUP" >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv echo -e "NA\t${STRATUM}\t${PHYLORANK}\tNO OUTGROUP" >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv else NUMGENES=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv | wc -l) NUMFOUNDERS=$(awk -F "\t" -v STRATUM="$STRATUM" '{ if ($2 == STRATUM) print $0 }' ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv | wc -l) echo -e "${NUMFOUNDERS}\t${STRATUM}\t${PHYLORANK}\t${HDF_OUTGROUP}" >> ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv echo -e "${NUMGENES}\t${STRATUM}\t${PHYLORANK}\t${HDF_OUTGROUP}" >> ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv fi # Modify the counters and continue with the loop let COUNTER_HDF+=1 let PHYLORANK-=1 done rm ${TMP_PATH}/tmp_taxa_distances mv ${OUTDIR}/${NCBITAX}_bitscores.tsv ${OUTDIR}/${NCBITAX}_abSENSE_results/ echo "--------------------------------------------------" echo "Step 4 finished!" echo "The detection failure probabilities for each for your genes can be found in ${OUTDIR}/${NCBITAX}_abSENSE_results/Detection_failure_probabilities" echo "The list of gene age assignments that cannot be explained by homology detection failure can be found in ${OUTDIR}/${NCBITAX}_HDF_gene_ages.tsv" echo "The list of gene-family founder events that cannot be explained by homology detection failure can be found in ${OUTDIR}/${NCBITAX}_HDF_founder_events.tsv" echo "The summary files for this analysis are found in ${OUTDIR}/${NCBITAX}_HDF_gene_age_summary.tsv and ${OUTDIR}/${NCBITAX}_HDF_founder_summary.tsv" echo "Please refer to the following paper for an in-depth interpretation of homology detection failure probabilities:" echo "Weisman, C.M., Murray, A.W., & Eddy, S.R. (2020). Many, but not all, lineage-specific genes can be explained by homology detection failure. PLoS biology, 18(11), e3000862." fi echo echo "genEra finished at:" date echo echo "Enjoy your results!!!"