''' This Snakefile contains rules to trim reads, map them to a reference genome, convert and sort the resulting bam files and count the mapped reads. ''' rule fastq_trim: ''' This rule trims paired-end reads to improve their quality. Specifically, it removes: - Low quality bases - A stretches longer than 20 bases - N stretches ''' input: reads1 = 'data/{sample}_1.fastq', reads2 = 'data/{sample}_2.fastq', output: trim1 = 'results/{sample}/{sample}_atropos_trimmed_1.fastq', trim2 = 'results/{sample}/{sample}_atropos_trimmed_2.fastq' log: 'logs/{sample}/{sample}_atropos_trimming.log' benchmark: 'benchmarks/{sample}/{sample}_atropos_trimming.txt' container: 'https://depot.galaxyproject.org/singularity/atropos%3A1.1.32--py312hf67a6ed_2' shell: ''' echo "Trimming reads in <{input.reads1}> and <{input.reads2}>" > {log} atropos trim -q 20,20 --minimum-length 25 --trim-n --preserve-order --max-n 10 \ --no-cache-adapters -a "A{{20}}" -A "A{{20}}" \ -pe1 {input.reads1} -pe2 {input.reads2} -o {output.trim1} -p {output.trim2} &>> {log} echo "Trimmed files saved in <{output.trim1}> and <{output.trim2}> respectively" >> {log} echo "Trimming report saved in <{log}>" >> {log} ''' rule read_mapping: ''' This rule maps trimmed reads of a fastq onto a reference assembly. ''' input: trim1 = rules.fastq_trim.output.trim1, trim2 = rules.fastq_trim.output.trim2 output: sam = 'results/{sample}/{sample}_mapped_reads.sam', report = 'results/{sample}/{sample}_mapping_report.txt' log: 'logs/{sample}/{sample}_mapping.log' benchmark: 'benchmarks/{sample}/{sample}_mapping.txt' container: 'https://depot.galaxyproject.org/singularity/hisat2%3A2.2.1--hdbdd923_6' shell: ''' echo "Mapping the reads" > {log} hisat2 --dta --fr --no-mixed --no-discordant --time --new-summary --no-unal \ -x resources/genome_indices/Scerevisiae_index \ -1 {input.trim1} -2 {input.trim2} -S {output.sam} --summary-file {output.report} 2>> {log} echo "Mapped reads saved in <{output.sam}>" >> {log} echo "Mapping report saved in <{output.report}>" >> {log} ''' rule sam_to_bam: ''' This rule converts a sam file to bam format, sorts it and indexes it. ''' input: sam = rules.read_mapping.output.sam output: bam = 'results/{sample}/{sample}_mapped_reads.bam', bam_sorted = 'results/{sample}/{sample}_mapped_reads_sorted.bam', index = 'results/{sample}/{sample}_mapped_reads_sorted.bam.bai' log: 'logs/{sample}/{sample}_mapping_sam_to_bam.log' benchmark: 'benchmarks/{sample}/{sample}_mapping_sam_to_bam.txt' container: 'https://depot.galaxyproject.org/singularity/samtools%3A1.21--h50ea8bc_0' shell: ''' echo "Converting <{input.sam}> to BAM format" > {log} samtools view {input.sam} -b -o {output.bam} 2>> {log} echo "Sorting .bam file" >> {log} samtools sort {output.bam} -O bam -o {output.bam_sorted} 2>> {log} echo "Indexing sorted .bam file" >> {log} samtools index -b {output.bam_sorted} -o {output.index} 2>> {log} echo "Sorted file saved in <{output.bam_sorted}>" >> {log} ''' rule reads_quantification_genes: ''' This rule quantifies the reads of a bam file mapping on genes and produces a count table for all genes of the assembly. ''' input: bam_once_sorted = rules.sam_to_bam.output.bam_sorted, output: gene_level = 'results/{sample}/{sample}_genes_read_quantification.tsv', gene_summary = 'results/{sample}/{sample}_genes_read_quantification.summary' log: 'logs/{sample}/{sample}_genes_read_quantification.log' benchmark: 'benchmarks/{sample}/{sample}_genes_read_quantification.txt' container: 'https://depot.galaxyproject.org/singularity/subread%3A2.0.6--he4a0461_2' shell: ''' echo "Counting reads mapping on genes in <{input.bam_once_sorted}>" > {log} featureCounts -t exon -g gene_id -s 2 -p --countReadPairs \ -B -C --largestOverlap --verbose -F GTF \ -a resources/Scerevisiae.gtf -o {output.gene_level} {input.bam_once_sorted} &>> {log} echo "Renaming output files" >> {log} mv {output.gene_level}.summary {output.gene_summary} echo "Results saved in <{output.gene_level}>" >> {log} echo "Report saved in <{output.gene_summary}>" >> {log} '''