#!/bin/sh

# Tested to work on Windows 10 WSL Ubuntu & Ubuntu Xenial
# If if bwa index reference exists (f.e. bwakit), simple mapping is performed
# Mapping requires significantly more storage space

# DEPENDENCIES: java 1.8; downloads needed Java programs
# Standard utils: sh, gawk, sed, grep, wget, time

# DEPENDENCIES FOR ALIGNMENT: samtools (htslib); bwa + index
# https://github.com/samtools/htslib            https://www.biorxiv.org/content/10.1101/397935v1
# https://github.com/samtools/samtools          https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/
# https://github.com/lh3/bwa                    http://arxiv.org/abs/1303.3997                        https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5155401/

# INDEX: recommendation hs38DH see https://github.com/lh3/bwa/blob/master/README-alt.md, https://twitter.com/lh3lh3/status/1073772500838944768 for example
# https://sourceforge.net/projects/bio-bwa/files/bwakit/	https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5522380/

# OPTIONAL: fastp and/or bwakit's bwa-postalt.js with k8 JavaScript interpreter
# https://github.com/OpenGene/fastp		https://doi.org/10.1093/bioinformatics/bty560
# https://github.com/lh3/bwa/tree/master/bwakit https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5522380/
# https://github.com/attractivechaos/k8		https://link.springer.com/chapter/10.1007/978-3-030-02465-9_27

# Consider using Intel or CloudFlare zlib on compatible machines for compression performance
# https://github.com/jtkukunas/zlib             https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/ https://doi.org/10.1093/bioinformatics/btq671


# Some BigY bam files have Casava 1.8 data in the bam read column; not sure what to do with this as it's in the original bam.
# This script preserves it in unmapped bam and filters it out before processing, but the extra space and Casava header can be cleaned up with:
# (samtools view -H sample1.bam; samtools view sample1.bam | awk '{ OFS="\t"; split($1,q," "); $1=q[1]; print }') | samtools view -b -o sample1.cleaned.bam

## Combining BigY hs37 and GRCh38, I removed reference and SANITIZE=true and ran:
# TMP=/mnt/SSD/ ./revert-bam.sh sample1BigY.bam
# TMP=/mnt/SSD/ ./revert-bam.sh sample1BigY2.bam
## Merging leaves reads from both files, apparently even with -c
# samtools merge -@8 -c -f sample1BigYmerge.bam sample1BigY.unmapped.bam sample1BigY2.unmapped.bam
## RevertSam expects reads in specific order, Picard Tools queryname order works best.
# java -Xmx30G -jar picard.jar SortSam I=sample1BigYmerge.bam O=sample1BigYmerge.qname.bam SORT_ORDER=queryname
## Added reference but not SANITIZE that would remove all duplicates, bwa undocumented removed duplicate read-id's:
# TMP=/mnt/SSD/ ./revert-bam.sh sample1BigYmerge.qname.bam
## Could also try --KEEP_FIRST_DUPLICATE=True with RevertSam if you don't want to align it

# File locations; note that these keep the path of the input file
SAMPLE=${1:-sample1.bam}
BASENAME=${SAMPLE%%.bam}
UBAMFILE=${BASENAME}.unmapped.bam
BMAPFILE=${BASENAME}.bwamem.bam
SORTFILE=${BASENAME}.sorted.bam

# Which reference genome to use: https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
# Heng Li, author of bwa mem and blog entry updated bwakit's hs38DH a year later, so it can be considered recommended
REF=hs38DH.fa

# Set compression level for output BAM files, higher if you archive them, lower if you have storage
# http://www.htslib.org/benchmarks/zlib.html https://blog.cloudflare.com/cloudflare-fights-cancer/
COMPRESS=5

# Gets & runs RevertSamSpark and MarkDuplicatesSpark multi-threaded instead of Picard workflow.
# Comment out if you have or want to use only a few CPU cores.
GATK_SPARK=4.6.2.0

# Don't delete bwa-mem mapped, raw, unsorted BAM file. Useful if you intend to test different processing.
KEEP_TEMPORARY=True

# Extra options for BWA, ie. long reads with "-x pacbio" etc. http://bio-bwa.sourceforge.net/bwa.shtml
BWAOPT=

# Ref: https://gatkforums.broadinstitute.org/gatk/discussion/6484
# From page: Additionally, we invoke the SANITIZE option to remove reads that cause problems for certain tools, e.g. MarkIlluminaAdapters.
# Downstream tools will have problems with paired reads with missing mates, duplicated records, and records with mismatches in length of bases and qualities.
# Any paired reads file subset for a genomic interval requires sanitizing to remove reads with lost mates that align outside of the interval.
SANITIZE=true

# Replace this with fixed thread count if you have limited memory or need to reserve CPU threads
cores=`nproc`
# Find out how much memory we have available; you can override java heap on cmdline if you need disk cache, other processes
totalmem=`LC_ALL=C free | grep -e "^Mem:" | gawk '{print $7}'`
# Set Java heap to 80% of all available memory, as metaspace usage has grown in new Java versions.
javamem=${2:-$((totalmem/1024/1024/10*8))}
# https://sourceforge.net/p/picard/wiki/Main_Page/#q-a-picard-program-that-sorts-its-output-sambam-file-is-taking-a-very-long-time-andor-running-out-of-memory-what-can-i-do
bamrecords=$((javamem*250000))
# From Java 6 update 18 max. heap is 1/4th of physical memory, so we can split 3/4th between cores for sorting.
percoremem=$((javamem*3/4/cores))

# If bio-tools.cfg exists where the script is run from, over-ride settings from it.
if [ -e ./bio-tools.cfg ];
then
  . ./bio-tools.cfg
fi

# These aren't set in bio-tools.cfg because they rely on REF, and BASENAME if that were altered.

# Trim sequencing adapters exactly using paired end read overlap for all sequencers and produce reports; disable other trimming
# Get and build it from https://github.com/OpenGene/fastp
if which fastp > /dev/null;
then
  FILTER="| fastp -QLGp --stdin --stdout --interleaved_in -j ${BASENAME}.fastp.json -h ${BASENAME}.fastp.html -R \"fastp report on ${BASENAME}\""
fi

# https://gatkforums.broadinstitute.org/gatk/discussion/8017/how-to-map-reads-to-a-reference-with-alternate-contigs-like-grch38
# k8 is JavaScript command line interpreter https://github.com/attractivechaos/k8 or https://github.com/attractivechaos/k8/releases
# and bwa-postalt.js is from bwakit https://github.com/lh3/bwa/tree/master/bwakit or https://github.com/lh3/bwa/raw/master/bwakit/bwa-postalt.js
# Easiest way to get it working is to get the prebuild Linux or Mac binary from releases link and copy to /usr/local/bin
if which k8 > /dev/null;
then
  POSTALT="| k8 ./bwa-postalt.js -p ${BASENAME}.hla ${REF}.alt"
fi

# Different fs from data files, prefer SSD
tmp=${TMP:-"/tmp"}

if [ ! -e ${SAMPLE} ];
then
  echo "Runs Picard Tools or GATK Spark on Java to create Broad Institute uBAM from input BAM"
  echo "$0 [input.bam] [gigabytes of memory to use, if not all available memory]"
  exit
fi

# Get a tested version of Picard Tools
if [ ! -e picard.jar ];
then
  wget https://github.com/broadinstitute/picard/releases/download/2.27.1/picard.jar
fi

if [ ! -z "${GATK_SPARK}" ] && [ ! -e gatk-${GATK_SPARK}/gatk ];
then
  if ! echo "${SANITIZE}" | grep -qi 'true';
  then
    echo "You can't use GATK_SPARK MarkDuplicatesSpark with single end reads, leave SANITIZE 'true'."
    exit
  fi
  wget -nc https://github.com/broadinstitute/gatk/releases/download/${GATK_SPARK}/gatk-${GATK_SPARK}.zip
  unzip gatk-${GATK_SPARK}.zip
fi

check_space ()
{
  # Figure out sizes in bytes
  inputsize=`du -D -b ${SAMPLE} | gawk '{print $1}'`
  outfree=`LC_ALL=C df -B1 ${SAMPLE%/*} | grep -v "^File" | gawk '{print $4}'`

  if [ ${inputsize} -gt ${outfree} ];
  then
    echo "Output file $1 (${inputsize}) probably won't fit on remaining space (${outfree})"
    exit
  fi

  # Figure out sizes in bytes - rule of thumb temporary is usually 1.5x times input
  tempsize=$((inputsize+(inputsize/2)))
  tempfree=`LC_ALL=C df -B1 ${tmp} | grep -v "^File" | gawk '{print $4}'`

  if [ ${tempsize} -gt ${tempfree} ];
  then
    echo "Approximately 1.5X input size is required for temporary storage $tmp"
    echo "Run with TMP=/path/to/tmp to use different path"
    exit
  fi
}

# https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_RevertSam.php
# See https://gatkforums.broadinstitute.org/gatk/discussion/6484/how-to-generate-an-unmapped-bam-from-fastq-or-aligned-bam option B
# The idea is to remove/revert any tags which would interfere or get recreated by workflow; tags which can't be regenerated can stay
# Leaves nh and qf tags in BigY BAM; they seem properietary
if [ ! -e ${UBAMFILE} ];
then
  check_space ${UBAMFILE}

  if [ -z "${GATK_SPARK}" ];
  then
    echo "############ Reverting ${SAMPLE} into ${UBAMFILE} with Picard Tools"
    time java -Xmx${javamem}G -jar picard.jar RevertSam \
      I=${SAMPLE} \
      O=${UBAMFILE} \
      SANITIZE=${SANITIZE} \
      MAX_DISCARD_FRACTION=0.005 \
      ATTRIBUTE_TO_CLEAR=XT \
      ATTRIBUTE_TO_CLEAR=XN \
      ATTRIBUTE_TO_CLEAR=X0 \
      ATTRIBUTE_TO_CLEAR=X1 \
      ATTRIBUTE_TO_CLEAR=XM \
      ATTRIBUTE_TO_CLEAR=XO \
      ATTRIBUTE_TO_CLEAR=XG \
      ATTRIBUTE_TO_CLEAR=MD \
      ATTRIBUTE_TO_CLEAR=NM \
      ATTRIBUTE_TO_CLEAR=MQ \
      ATTRIBUTE_TO_CLEAR=MC \
      ATTRIBUTE_TO_CLEAR=SA \
      ATTRIBUTE_TO_CLEAR=ms \
      ATTRIBUTE_TO_CLEAR=AS \
      ATTRIBUTE_TO_CLEAR=OC \
      ATTRIBUTE_TO_CLEAR=OP \
      ATTRIBUTE_TO_CLEAR=XS \
      ATTRIBUTE_TO_CLEAR=XA \
      ATTRIBUTE_TO_CLEAR=pa \
      SORT_ORDER=queryname \
      RESTORE_ORIGINAL_QUALITIES=true \
      REMOVE_DUPLICATE_INFORMATION=true \
      REMOVE_ALIGNMENT_INFORMATION=true \
      MAX_RECORDS_IN_RAM=${bamrecords} \
      COMPRESSION_LEVEL=${COMPRESS} \
      TMP_DIR=${tmp}
  else
    rm -rf ${tmp}/SPARK
    mkdir -p ${tmp}/SPARK
    echo "############ Reverting ${SAMPLE} into ${UBAMFILE} with GATK Spark"
    time gatk-${GATK_SPARK}/gatk --java-options "-Xmx${javamem}G -Dsamjdk.compression_level=${COMPRESS}" RevertSamSpark \
      -I ${SAMPLE} \
      -O ${UBAMFILE} \
      --sanitize ${SANITIZE} \
      --attributes-to-clear XT \
      --attributes-to-clear XN \
      --attributes-to-clear AS \
      --attributes-to-clear OC \
      --attributes-to-clear OP \
      --attributes-to-clear XS \
      --attributes-to-clear XA \
      --attributes-to-clear pa \
      --sort-order queryname \
      --dont-restore-original-qualities false \
      --remove-duplicate-information true \
      --keep-alignment-information false \
      --tmp-dir ${tmp}/SPARK \
      --output-shard-tmp-dir ${tmp}/SPARK/${SAMPLE##*/}.parts \
        | grep -Ev "INFO (Executor|NewHadoopRDD|ShuffleBlockFetcherIterator|SparkHadoopMapRedUtil|FileOutputCommitter):" --line-buffered
  fi
fi

# WARNING: This is NOT according to the Broad Institute GATK 4.0 Best Practices, but leaving it here for reference
# Does not perform quality control, adapter trimming or base quality score recalibration
if [ -e ${REF} ];
then

  # Identify Complete Genomics/BGI/MGI style read names for library complexity estimation
  try_regex='CL10.......L.C([0-9]+)R([0-9]+)_([0-9]+)'
  if samtools view ${UBAMFILE} | head -1 | cut -f1 | grep -q -E "${try_regex}";
  then
    echo "############   Identified BGI/MGI/Complete Genomics style read names"
    if [ -z "${GATK_SPARK}" ];
    then
      regex="READ_NAME_REGEX=${try_regex}"
    else
      regex="--read-name-regex ${try_regex}"
    fi
  fi

  # Uses .hdr file also as a flag of whether mapping finished, in case we restart script
  # This should probably be "|| [ -e ..." but sometimes I don't want to overwrite partial
  if [ ! -e ${BMAPFILE} ] && [ ! -e ${UBAMFILE}.hdr ];
  then
    check_space ${BMAPFILE}

    samtools view -H ${UBAMFILE} | grep -v "^@HD" > ${UBAMFILE}.hdr
    # FTDNA BigY doesn't have PL tag, so add it for downstream processing.
    if ! grep -q "^@RG.*PL:" ${UBAMFILE}.hdr;
    then
      echo "############   Adding missing platform identifiers"
      sed -i "s/^@RG\t.*/&\tPL:ILLUMINA/" ${UBAMFILE}.hdr
    fi

    echo "############ Using BWA MEM to align ${UBAMFILE} against ${REF} into ${BMAPFILE}"
    # Casava 1.8 header, observed s/[12]:[YN]:[0-9]*:[^\/]*\/[12]\t//
    # According to Wikipedia this can also end in barcode or sample-ID, so removing any.
    time samtools fastq -t ${UBAMFILE} \
      | eval sed "s/[12]:[YN]:[0-9]*:[^[:space:]]*[[:space:]]//" \
      ${FILTER} \
      | eval bwa mem ${BWAOPT} -p -t ${cores} -M -C -H ${UBAMFILE}.hdr ${REF} - \
      ${POSTALT} \
      | samtools view -b -o ${BMAPFILE}
    rm ${UBAMFILE}.hdr
  else
    echo "############ ${BMAPFILE} or ${UBAMFILE}.hdr exists, so skipping mapping"
  fi

  echo "############ Marking duplicates and chromosome order sorting ${UBAMFILE} into ${SORTFILE}"
  check_space ${SORTFILE}

  # Unfortunately, MarkDuplicates seeks back to beginning of the input BAM so mapping can't just be piped in
  # See https://software.broadinstitute.org/gatk/documentation/article?id=6747 for OPTICAL_DUPLICATE_PIXEL_DISTANCE and regex
  if [ -z "${GATK_SPARK}" ];
  then
    time java -jar picard.jar MarkDuplicates INPUT=${BMAPFILE} OUTPUT=/dev/stdout METRICS_FILE=${SAMPLE}.dup \
      ASSUME_SORT_ORDER=queryname TAGGING_POLICY=All COMPRESSION_LEVEL=0 TMP_DIR=$tmp \
      OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 ${regex} \
        | java -jar picard.jar FifoBuffer BUFFER_SIZE=2147483645 DEBUG_FREQUENCY=61 \
        | samtools sort -T $tmp/${SAMPLE##*/} -@${cores} -m${percoremem}G -l${COMPRESS} \
        | tee ${SORTFILE} \
        | samtools index -@${cores} - ${SORTFILE}.bai
  else
    # This took 8:20 vs. 6:17 on 4 cores, before running out of memory in index generation. Spark temp file space about 2X BAM, lots of IO.
    # https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php
    # Results are indentical, but duplication metrics are less detailed, PG header isn't added.
    # --bam-partition-size 33554432 is maximum & default
    rm -rf ${tmp}/SPARK
    mkdir -p ${tmp}/SPARK
    time gatk-${GATK_SPARK}/gatk --java-options "-Xmx${javamem}G -Dsamjdk.compression_level=${COMPRESS}" MarkDuplicatesSpark -I ${BMAPFILE} -O ${SORTFILE} -M ${SAMPLE}.dup \
      --duplicate-tagging-policy All --tmp-dir ${tmp}/SPARK --output-shard-tmp-dir ${tmp}/SPARK/${SAMPLE##*/}.parts --optical-duplicate-pixel-distance 2500 ${regex} 2>&1 \
        | grep -Ev "INFO (Executor|NewHadoopRDD|ShuffleBlockFetcherIterator|SparkHadoopMapRedUtil|FileOutputCommitter):" --line-buffered
    # At least for me, the index file generation fails even with enough memory, so let's just generate it for now.
    mv ${SORTFILE}.bai ${SORTFILE}.bai.bak
    time samtools index -@${cores} ${SORTFILE}
  fi
  # Contrary to MarkDuplicatesSpark documentation, NM and MD tags look fine, but if you need UQ tags you need to run SetNmMdAndUqTags
  # https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_SetNmMdAndUqTags.php

  # set -e and set -o pipefail require bash, so check the final file is intact instead
  if [ -z ${KEEP_TEMPORARY} ] && samtools quickcheck ${SORTFILE};
  then
    echo "############   Deleting aligner temporary output file"
    rm ${BMAPFILE}
  fi
fi