version 1.0
## Copyright Broad Institute, 2017
## This script should convert a CRAM to SAM to BAM and output a BAM, BAM index, and validation report to a Google bucket.
## If you'd like to do this on multiple CRAMs, create a sample set in the Data tab.
## This approach was chosen instead of converting CRAM to BAM directly with Samtools because Samtools 1.3 produces incorrect
## bins due to an old version of htslib included in the package. Samtools versions 1.4 & 1.5 have an NM issue that causes them
## to not validate with Picard.
##
## TESTED: It was tested using the Genomes in the Cloud Docker image version 2.3.1-1500064817.
## Versions of other tools on this image at the time of testing:
##   PICARD_VER=1.1150
##   GATK34_VER=3.4-g3c929b0
##   GATK35_VER=3.5-0-g36282e4
##   GATK36_VER=3.6-44-ge7d1cd2
##   GATK4_VER=4.beta.1
##   SAMTOOLS_VER=1.3.1
##   BWA_VER=0.7.15.r1140
##   TABIX_VER=0.2.5_r1005
##   BGZIP_VER=1.3
##   SVTOOLKIT_VER=2.00-1650
## It was tested pulling the HG38 reference Fasta and Fai.
## Successfully tested on Cromwell version 47. Does not work on versions < v23 due to output syntax.
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING : This script is released under the WDL source code license (BSD-3) (see LICENSE in https://github.com/broadinstitute/wdl).
## Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script.
## Please see the Docker image for detailed licensing information pertaining to the included programs.
##

# WORKFLOW DEFINITION
# Converts a single CRAM to a BAM (streaming through SAM), indexes the BAM,
# and then runs Picard ValidateSamFile on the result.
#
# Inputs:
#   ref_fasta, ref_fasta_index, ref_dict - reference files handed to the conversion task
#   input_cram                           - CRAM file to convert
#   sample_name                          - prefix used for the output BAM/BAI file names
#   gotc_docker                          - Docker image used by both tasks
#   preemptible_tries                    - value passed to each task's `preemptible` runtime attribute
workflow CramToBamFlow {
  input {
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File input_cram
    String sample_name
    String gotc_docker = "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"
    Int preemptible_tries = 3
  }

  # Converts CRAM to SAM to BAM and makes the BAI
  call CramToBamTask {
    input:
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict,
      input_cram = input_cram,
      sample_name = sample_name,
      docker_image = gotc_docker,
      preemptible_tries = preemptible_tries
  }

  # Validates the BAM produced by the conversion task
  call ValidateSamFile {
    input:
      input_bam = CramToBamTask.outputBam,
      docker_image = gotc_docker,
      preemptible_tries = preemptible_tries
  }

  # Outputs BAM, BAI, and validation report to the FireCloud data model
  output {
    File outputBam = CramToBamTask.outputBam
    File outputBai = CramToBamTask.outputBai
    File validation_report = ValidateSamFile.report
  }
}

# TASK DEFINITIONS

# Converts a CRAM to a BAM by piping `samtools view` (CRAM -> SAM) into a second
# `samtools view` (SAM -> BAM), then indexes the BAM and renames the index to
# <sample_name>.bai.
task CramToBamTask {
  input {
    # Command parameters
    File ref_fasta
    # ref_fasta_index and ref_dict are not named on the command line; they are
    # declared as inputs and counted in the disk estimate below.
    # NOTE(review): presumably localized so samtools can find the .fai next to the FASTA - confirm.
    File ref_fasta_index
    File ref_dict
    File input_cram
    String sample_name

    # Runtime parameters
    # The spelling "addtional" is part of this task's input interface and is
    # kept so existing inputs JSON files keep working.
    Int addtional_disk_size = 20
    Int machine_mem_size = 15
    String docker_image
    Int preemptible_tries
  }

  # Output BAM size is estimated as the CRAM size divided by 0.60.
  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  # disk_size = input size + estimated output size + reference size, rounded up, plus buffer
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + addtional_disk_size

  # Calls samtools view to do the conversion.
  # All interpolated paths and names are double-quoted so that values containing
  # whitespace or glob characters are passed to samtools/mv as single arguments.
  command {
    set -eo pipefail

    samtools view -h -T "~{ref_fasta}" "~{input_cram}" |
    samtools view -b -o "~{sample_name}.bam" -
    samtools index -b "~{sample_name}.bam"
    mv "~{sample_name}.bam.bai" "~{sample_name}.bai"
  }

  # Run time attributes:
  # Use a docker with samtools. Set this up as a workspace attribute.
  # cpu of one because no multi-threading is required. This is also default, so don't need to specify.
  # disk_size should equal input size + output size + buffer
  runtime {
    docker: docker_image
    memory: machine_mem_size + " GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: preemptible_tries
  }

  # Outputs a BAM and BAI with the same sample name
  output {
    File outputBam = "~{sample_name}.bam"
    File outputBai = "~{sample_name}.bai"
  }
}

# Validates BAM output to ensure it wasn't corrupted during the file conversion.
# Runs Picard ValidateSamFile in SUMMARY mode and writes the report to
# <input_bam basename without .bam>.validation_report.
task ValidateSamFile {
  input {
    File input_bam
    # The spelling "addtional" is part of this task's input interface and is
    # kept so existing inputs JSON files keep working.
    Int addtional_disk_size = 10
    Int machine_mem_size = 4
    String docker_image
    Int preemptible_tries
  }

  String output_name = basename(input_bam, ".bam") + ".validation_report"
  Int disk_size = ceil(size(input_bam, "GB")) + addtional_disk_size
  # Java heap (-Xmx) is set 1 GB below the machine memory requested in runtime.
  Int command_mem_size = machine_mem_size - 1

  # INPUT/OUTPUT values are double-quoted so paths containing whitespace or glob
  # characters reach Picard as single arguments.
  command {
    java -Xmx~{command_mem_size}G -jar /usr/gitc/picard.jar \
      ValidateSamFile \
      INPUT="~{input_bam}" \
      OUTPUT="~{output_name}" \
      MODE=SUMMARY \
      IS_BISULFITE_SEQUENCED=false
  }

  # Run time attributes:
  # Use a docker with the picard.jar. Set this up as a workspace attribute.
  # Return codes 0 and 1 are both listed in continueOnReturnCode.
  # NOTE(review): presumably so a report that lists validation problems is still delivered - confirm.
  # Read more about return codes here: https://github.com/broadinstitute/cromwell#continueonreturncode
  runtime {
    docker: docker_image
    memory: machine_mem_size + " GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: preemptible_tries
    continueOnReturnCode: [0,1]
  }

  # A text file is generated that will list errors or warnings that apply.
  output {
    File report = "~{output_name}"
  }
}