'''
This Snakefile contains rules to trim reads, map them to a reference genome,
convert and sort the resulting bam files and count the mapped reads.
'''

rule fastq_trim:
    '''
    This rule trims paired-end reads to improve their quality. Specifically, it removes:
    - Low quality bases
    - A stretches longer than 20 bases
    - N stretches
    '''
    input:
        reads1 = 'data/{sample}_1.fastq',
        reads2 = 'data/{sample}_2.fastq',
    output:
        trim1 = 'results/{sample}/{sample}_atropos_trimmed_1.fastq',
        trim2 = 'results/{sample}/{sample}_atropos_trimmed_2.fastq'
    log:
        'logs/{sample}/{sample}_atropos_trimming.log'
    benchmark:
        'benchmarks/{sample}/{sample}_atropos_trimming.txt'
    container:
        'https://depot.galaxyproject.org/singularity/atropos%3A1.1.32--py312hf67a6ed_2'
    shell:
        '''
        echo "Trimming reads in <{input.reads1}> and <{input.reads2}>" > {log}
        atropos trim -q 20,20 --minimum-length 25 --trim-n --preserve-order --max-n 10 \
        --no-cache-adapters -a "A{{20}}" -A "A{{20}}" \
        -pe1 {input.reads1} -pe2 {input.reads2} -o {output.trim1} -p {output.trim2} &>> {log}
        echo "Trimmed files saved in <{output.trim1}> and <{output.trim2}> respectively" >> {log}
        echo "Trimming report saved in <{log}>" >> {log}
        '''

rule read_mapping:
    '''
    This rule maps trimmed reads of a fastq onto a reference assembly.
    '''
    input:
        trim1 = rules.fastq_trim.output.trim1,
        trim2 = rules.fastq_trim.output.trim2
    output:
        sam = 'results/{sample}/{sample}_mapped_reads.sam',
        report = 'results/{sample}/{sample}_mapping_report.txt'
    log:
        'logs/{sample}/{sample}_mapping.log'
    benchmark:
        'benchmarks/{sample}/{sample}_mapping.txt'
    container:
        'https://depot.galaxyproject.org/singularity/hisat2%3A2.2.1--hdbdd923_6'
    shell:
        '''
        echo "Mapping the reads" > {log}
        hisat2 --dta --fr --no-mixed --no-discordant --time --new-summary --no-unal \
        -x resources/genome_indices/Scerevisiae_index \
        -1 {input.trim1} -2 {input.trim2} -S {output.sam} --summary-file {output.report} 2>> {log}
        echo "Mapped reads saved in <{output.sam}>" >> {log}
        echo "Mapping report saved in <{output.report}>" >> {log}
        '''

rule sam_to_bam:
    '''
    This rule converts a sam file to bam format, sorts it and indexes it.
    '''
    input:
        sam = rules.read_mapping.output.sam
    output:
        bam = 'results/{sample}/{sample}_mapped_reads.bam',
        bam_sorted = 'results/{sample}/{sample}_mapped_reads_sorted.bam',
        index = 'results/{sample}/{sample}_mapped_reads_sorted.bam.bai'
    log:
        'logs/{sample}/{sample}_mapping_sam_to_bam.log'
    benchmark:
        'benchmarks/{sample}/{sample}_mapping_sam_to_bam.txt'
    container:
        'https://depot.galaxyproject.org/singularity/samtools%3A1.21--h50ea8bc_0'
    shell:
        '''
        echo "Converting <{input.sam}> to BAM format" > {log}
        samtools view {input.sam} -b -o {output.bam} 2>> {log}
        echo "Sorting .bam file" >> {log}
        samtools sort {output.bam} -O bam -o {output.bam_sorted} 2>> {log}
        echo "Indexing sorted .bam file" >> {log}
        samtools index -b {output.bam_sorted} -o {output.index} 2>> {log}
        echo "Sorted file saved in <{output.bam_sorted}>" >> {log}
        '''

rule reads_quantification_genes:
    '''
    This rule quantifies the reads of a bam file mapping on genes and produces
    a count table for all genes of the assembly.
    '''
    input:
        bam_once_sorted = rules.sam_to_bam.output.bam_sorted,
    output:
        gene_level = 'results/{sample}/{sample}_genes_read_quantification.tsv',
        gene_summary = 'results/{sample}/{sample}_genes_read_quantification.summary'
    log:
        'logs/{sample}/{sample}_genes_read_quantification.log'
    benchmark:
        'benchmarks/{sample}/{sample}_genes_read_quantification.txt'
    container:
        'https://depot.galaxyproject.org/singularity/subread%3A2.0.6--he4a0461_2'
    shell:
        '''
        echo "Counting reads mapping on genes in <{input.bam_once_sorted}>" > {log}
        featureCounts -t exon -g gene_id -s 2 -p --countReadPairs \
        -B -C --largestOverlap --verbose -F GTF \
        -a resources/Scerevisiae.gtf -o {output.gene_level} {input.bam_once_sorted} &>> {log}
        echo "Renaming output files" >> {log}
        mv {output.gene_level}.summary {output.gene_summary}
        echo "Results saved in <{output.gene_level}>" >> {log}
        echo "Report saved in <{output.gene_summary}>" >> {log}
        '''