#!/bin/bash
#SBATCH -J ATACseq
#SBATCH -o %x.out
#SBATCH -n 12
#SBATCH --mem-per-cpu=18000


usage(){

    echo "A T A C - S E Q   W O R K F L O W - @bixBeta"
    echo
    echo

    echo "Usage: bash" $0 "[-h arg] [-p arg] [-d args] [-t arg] [-g arg] [-q arg]"
    echo
    echo "---------------------------------------------------------------------------------------------------------------"
    echo "[-h] --> Display Help"
    echo "[-p] --> Project Identifier Number"
    echo "[-d] --> Comma Spearated Values for Delimiter and Field <delim,field or default> default: _,5 "
    echo "[-t] --> Trimming <nextseq or nova>;"
    echo "[-g] --> Reference Genome <mm10 or hg38>"
    echo "[-q] --> Execute atacQC.R script <yes>"
    echo "---------------------------------------------------------------------------------------------------------------"
}



declare -A genomeDir

genomeDir=(
["mm10"]="/workdir/genomes/Mus_musculus/mm10/ENSEMBL/BWAIndex/genome.fa" \
["hg38"]="/workdir/genomes/Homo_sapiens/hg38/ENSEMBL/bwa.index/Homo_sapiens.GRCh38.dna.toplevel.fa"
)

declare -A gtfs

gtfs=(
["mm10"]="/workdir/genomes/Mus_musculus/mm10/ENSEMBL/Mus_musculus.GRCm38.96.gtf" \
["hg38"]="/workdir/genomes/Homo_sapiens/hg38/ENSEMBL/Homo_sapiens.GRCh38.96.gtf"
)

declare -A gAlias # for compatibility with atacQC.R
gAlias=(
["mm10"]="mouse" \
["hg38"]="human"
)

declare -A gSize # for macs2
gSize=(
["mm10"]="mm" \
["hg38"]="hs"
)

declare -A blkList
blkList=(
["mm10"]="/workdir/genomes/Mus_musculus/mm10/ENSEMBL/mm10-blacklist.v2.bed" \
["hg38"]="/workdir/genomes/Homo_sapiens/hg38/ENSEMBL/hg38-blacklist.v2.bed"
)


trimPE(){

        cd fastqs
        ls -1 *_R1.fastq* > .R1
        ls -1 *_R2.fastq* > .R2
        paste -d " " .R1 .R2 > Reads.list

        readarray fastqs < Reads.list
        mkdir fastQC

        for i in "${fastqs[@]}"
        do
                trim_galore --nextseq 20 --length 50  -j 8 --paired --gzip --fastqc --fastqc_args "-t 4 --outdir ./fastQC" $i
        done

        mkdir TrimQC_stats trimmed_fastqs
        mv *_trimming_report.txt TrimQC_stats
        mv *_val* trimmed_fastqs
        mv TrimQC_stats fastQC trimmed_fastqs ../

        cd ..
}


trimHiSeqPE(){

                cd fastqs
                ls -1 *_1.fq* > .R1
                ls -1 *_2.fq* > .R2
                paste -d " " .R1 .R2 > Reads.list

                readarray fastqs < Reads.list
                mkdir fastQC

                for i in "${fastqs[@]}"
                do
                        trim_galore --quality 20 --gzip -j 8 --length 50  --paired --fastqc --fastqc_args "-t 4 --outdir ./fastQC" $i
                done

                mkdir TrimQC_stats trimmed_fastqs
                mv *_trimming_report.txt TrimQC_stats
                mv *_val* trimmed_fastqs
                mv TrimQC_stats fastQC trimmed_fastqs ..

                cd ..
}




alignPE(){

        cd trimmed_fastqs

        ls -1 *_val_1.fq.gz > .trR1
        ls -1 *_val_2.fq.gz > .trR2
        paste -d " " .trR1 .trR2 > Trimmed.list

        readarray trimmedFastqs < Trimmed.list

        for i in "${trimmedFastqs[@]}"

        do
                # INDEX="/workdir/genomes/Mus_musculus/mm10/ENSEMBL/BWAIndex/genome.fa"
                iSUB=`echo $i | cut -d ${DELIMITER} -f${FIELD}`

                bwa mem -t 24 -M -R "@RG\tID:${iSUB}\tSM:${iSUB}\tPL:ILLUMINA\tLB:${iSUB}\tPU:1" ${genomeDir[${DIR}]} $i \
                | samtools view -@ 24 -b -h -F 0x0100 -O BAM -o ${iSUB}.bam
        done

                mkdir primary-BAMS
                mv *.bam primary-BAMS
                mv primary-BAMS ..
                cd ..
}

sort(){
                cd primary-BAMS
            for i in *.bam
            do
            samtools sort $i > `echo  $i | cut -d "." -f1`.sorted.bam
            done

            for i in *.sorted.bam
            do
                samtools index $i
            done

                    # alignment stats etc. on raw bams
                    for i in *.sorted.bam
                    do
                        iSUB=`echo $i | cut -d "." -f1`
                        samtools flagstat $i > ${iSUB}.primary.flagstat
                        samtools idxstats $i > ${iSUB}.primary.idxstats
                    done

                cd ..
                pwd
}





rmMT(){
                cd primary-BAMS
                for i in *.sorted.bam
                do

                    iSUB=`echo $i | cut -d "." -f1`

                    samtools view -H `ls -1 *.sorted.bam | head -1` | cut -f2 | grep "SN:" |  cut -d ":" -f2 | grep -v "MT\|_\|\." | xargs samtools view -b $i > ${iSUB}.noMT.bam

                done



                for i in *.noMT.bam
                do
                    iSUB=`echo $i | cut -d "." -f1`
                    bedtools intersect -v -a $i -b ${blkList[${DIR}]} > ${iSUB}.noBlacklist.noMT.bam
                done


                cd ..
}

markDups(){
                cd primary-BAMS
        for i in *.noBlacklist.noMT.bam
        do
            iSUB=`echo $i | cut -d "." -f1`
            java -jar /programs/bin/picard-tools/picard.jar \
            MarkDuplicates \
            INPUT=$i \
            OUTPUT=${iSUB}.dupMarked.noMT.bam \
            ASSUME_SORTED=true \
            REMOVE_DUPLICATES=false \
            METRICS_FILE=${iSUB}.MarkDuplicates.metrics.txt \
            VALIDATION_STRINGENCY=LENIENT \
            TMP_DIR=tmp

        done
                cd ..
}

dedupBAM(){
                cd primary-BAMS
                # alignment stats etc. on dupMarked no MT bams
                for i in *.dupMarked.noMT.bam
                do
                    iSUB=`echo $i | cut -d "." -f1`
                    samtools index $i
                    samtools flagstat $i > ${iSUB}.noMT.flagstat
                    samtools idxstats $i > ${iSUB}.noMT.idxstats
                done

        for i in *.dupMarked.noMT.bam
        do
                iSUB=`echo $i | cut -d "." -f1`
                samtools view -b -h -F 0X400 $i > ${iSUB}.DEDUP.bam
        done

                for i in *.DEDUP.bam; do samtools index $i ; samtools idxstats $i > `echo $i | cut -d "." -f1`.DEDUP.idxstats; done
                for i in *.DEDUP.bam; do samtools flagstat $i > `echo $i | cut -d "." -f1`.DEDUP.flagstat; done

                multiqc -n ${PIN}.multiqc.report .

                mkdir dedup-BAMS
                mv *.DEDUP* dedup-BAMS/
                mv dedup-BAMS ..
                cd ..

}

tagDir(){
  cd dedup-BAMS
  for i in *.DEDUP.bam
  do
  iSUB=`echo "$i" | cut -d'.' -f1` # subset to rename
  /home/fa286/bin/HOMER/bin/makeTagDirectory "$iSUB".tag.dir "$i"
  done
  cd ..
}

callPeak(){

  cd dedup-BAMS
    echo "calling peaks on DEDUP bams"
    mkdir peaks.OUT
    for i  in *.DEDUP.bam
      do
        iSUB=`echo $i | cut -d "." -f1`
        macs2 callpeak -t $i \
        -f BAMPE \
        -n ${iSUB} \
        -g ${gSize[${DIR}]} \
        -q 0.05 \
        --outdir peaks.OUT \
        --nomodel --shift 37 --ext 73 \
        --fe-cutoff 5 \
        --keep-dup all
      done

  cd ..

}


mergedPeaks(){

  echo "running module mergedPeaks"
  cd dedup-BAMS

    allBams=`echo *.DEDUP.bam`

    macs2 callpeak -t ${allBams} \
    -f BAMPE \
    -n allSamplesMergedPeakset \
    -g ${gSize[${DIR}]} \
    -q 0.05 \
    --outdir peaks.OUT \
    --nomodel --shift 37 --ext 73 \
    --fe-cutoff 5 \
    --keep-dup all

  cd ..
}


saf(){
  # awk 'BEGIN{FS=OFS="\t"; print "GeneID\tChr\tStart\tEnd\tStrand"}{print $4, $1, $2+1, $3, "."}' ${sample}_peaks.narrowPeak > ${sample}_peaks.saf
  cd dedup-BAMS/peaks.OUT
  awk 'BEGIN{FS=OFS="\t"; print "GeneID\tChr\tStart\tEnd\tStrand"}{print $4, $1, $2+1, $3, "."}' allSamplesMergedPeakset_peaks.narrowPeak > allSamplesMergedPeakset.saf
  cd ../../
}

#saf

frip(){
  # featureCounts -p -a ${sample}_peaks.saf -F SAF -o readCountInPeaks.txt ${sample}.sorted.marked.filtered.shifted.bam
  cd dedup-BAMS/
    for i  in *.DEDUP.bam
    do
      iSUB=`echo $i | cut -d "." -f1`
      featureCounts -p -a peaks.OUT/allSamplesMergedPeakset.saf -F SAF -o "${iSUB}".readCountInPeaks.txt $i
    done

  cd ..
}

annotatePeaks(){

    cd dedup-BAMS/peaks.OUT
    /home/fa286/bin/HOMER/bin/annotatePeaks.pl allSamplesMergedPeakset.saf ${DIR} -gtf ${gtfs[${DIR}]} > allSamplesMergedPeakset.Annotated.saf
    cd ../..
}

bedGraphs(){
  cd dedup-BAMS
    for i in *.tag.dir
    do
        makeUCSCfile ${i} -o auto -fsize 1e10 -res 1 -color 106,42,73 -style chipseq
    done

    mkdir tagDirs
        mv *.tag.dir tagDirs
        cd tagDirs
        mkdir bedGraphs
            for i in *.tag.dir
            do
                cd $i
                zcat *.ucsc.bedGraph.gz | awk '{if(NR>1) print "chr"$0; else print $0}' | gzip > `basename *.ucsc.bedGraph.gz .ucsc.bedGraph.gz`.ucsc.bg.gz
                mv *.ucsc.bg.gz ../bedGraphs
                cd ..
            done
        cd ..

    mkdir featureCounts
    mv *.txt featureCounts

    multiqc -n ${PIN}.FRIP.multiqc.report -b "Please note that the featureCounts M Assigned Column refers to Fragments and Not Reads" --ignore tagDirs --ignore peaks.OUT .

  cd ..
}

atacQC(){

    cd dedup-BAMS
    echo "genome alias" = ${gAlias[${DIR}]}
    Rscript /home/fa286/bin/scripts/atacQC.R ${gAlias[${DIR}]}
    # ${gAlias[${DIR}]}
    ~/bin/scripts/html.atacQC.sh `echo ${PIN}_atacQC`

    cd ..

    /home/fa286/bin/tree-1.7.0/tree > folder.structure

}


while getopts "hp:t:g:q:d:" opt; do
    case ${opt} in

    h)
        echo
        echo
        echo
        usage
        echo
        echo
        exit 1

    ;;

    p )

        PIN=$OPTARG
        echo "Project Identifier = " $PIN
    ;;

    t )

        T=$OPTARG

    ;;

    g)

        DIR=$OPTARG

    ;;

    q)

        QC=$OPTARG

    ;;

    d)
        DELIM=$OPTARG

    ;;


    \? )
        echo
        echo
        echo
        usage

    ;;

    esac

done
# shift $((OPTIND -1))

#-------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------
## check if PIN is provided

if [[ -z "${PIN+x}" ]]; then

    PIN="PIN_Null"
fi

# PARAMETER CHECKS

                    #-------------------------------------------------------------------------------------------------------------
                    #-------------------------------------------------------------------------------------------------------------
                    ## check if delimiter parameter exists
                    if [[ ! -z "${DELIM+x}" ]]; then
                        #statements
                        if [[ $DELIM == default ]]; then

                        DELIMITER="_"
                        FIELD="5"
                        echo "file naming will be done using the default delimiter settings"
                      else

                        DELIMITER=`echo $DELIM | cut -d , -f1`
                        FIELD=`echo $DELIM | cut -d , -f2-`
                        echo "file naming will be done using the delim = $DELIMITER and field = $FIELD settings"

                      fi

                    fi

                    #-------------------------------------------------------------------------------------------------------------
                    #-------------------------------------------------------------------------------------------------------------
                    ## check if trimming parameter exists

                    if [[ ! -z "${T+x}" ]]; then

                        if [[ $T == nextseq ]]; then
                            trimPE
                        elif [[ $T == nova ]]; then
                            trimHiSeqPE
                        else
                        echo "-t only accepts nextseq or nova as arguments"
                        exit 1

                        fi
                    fi

                    #-------------------------------------------------------------------------------------------------------------
                    #-------------------------------------------------------------------------------------------------------------
                    ## check if genomeDir provided

                    if [[ ! -z "${DIR+x}" ]]; then
                        if [ ${genomeDir[${DIR}]+_} ]; then
                            echo Reference genome selected = $DIR
                            echo
                            alignPE
                            sort
                            rmMT
                            markDups
                            dedupBAM
                            callPeak
                            mergedPeaks
                            saf
                            frip
                            tagDir
                            annotatePeaks
                            bedGraphs
                        else
                            echo "The reference genome provided '"$DIR"' is not available"
                            exit 1

                        fi
                    fi

                    if [[ ! -z "${QC+x}" ]]; then

                        if [[ $QC == yes ]]; then
                            atacQC
                        else
                            echo "-q option only accepts yes as an argument"
                            exit 1
                        fi
                    fi






#-------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------

if [[ -z $1 ]] || [[  $1 = "help"  ]] ; then
    #statements
    echo
    echo
    usage
    echo
    echo
    exit 1

else
    echo >> beta5.atac.log
    echo `date -u` >> beta5.atac.log
    echo "Project Identifier Specified = " $PIN >> beta5.atac.log
    echo "Reference Genome Specified   = " $DIR >> beta5.atac.log
    echo "Trimming                     = " $T >> beta5.atac.log
    echo >> beta5.atac.log

    echo "ENV INFO: " >> beta5.atac.log
    echo >> beta5.atac.log
    echo "STAR version:" `~/bin/STAR-2.7.0e/bin/Linux_x86_64/STAR --version` >> beta5.atac.log
    echo "multiqc version:" `~/miniconda2/envs/RSC/bin/multiqc --version` >> beta5.atac.log
    echo "samtools version:" `/programs/bin/samtools/samtools --version` >> beta5.atac.log
    echo "macs2 version: macs2 2.1.0.20150731 " >> beta5.atac.log
    echo "HOMER version: v4.10.4" >> beta5.atac.log
    echo -------------------------------------------------------------------------------------------------- >> beta5.atac.log

fi