import os #### Quick documentation #### ##Files needed by pipeline: ## - GRChxx.xx.fa ## - GRChxx.xx.fa.fai ## - GRChxx.xx.dict ## - dbSNP_vxxx_xxxx_noCHR.vcf - use remove_chr.py on vcf downloaded from dbSNP FTP site ## - Bed file containing exome captured regions - *.bed ## - config.py - python file containing variables to input in the pipeline. All variables must be declared before #running the pipeline ##Program needed by pipeline: ## - picard.jar - ## - GenomeAnalysisTK.jar - ## - gatk - GATK4 for MuTect2 ######################### ##### SCons Settings #### ######################### vars = Variables("/working/configuration.py") vars.Add('reference', 'The genome reference file',"reference.fa") vars.Add('processors', 'Number of CPUs to be used', "2") vars.Add('dbsnpVCF','The dbSNP VCF',"All_20180418.vcf.gz") vars.Add('cosmic','The Cosmic VCF',"CosmicCodingMuts.vcf.gz") vars.Add('exomeRegions',"The bed file containing exome regions","exome_regions.bed") vars.Add('tumor',"Tumor sample","") vars.Add('normal',"Normal sample","") vars.Add('contamination','Computation of predicted contamination in tumor sample by CalculateContamination of GATK4 (y/n)',"n") vars.Add('gatk4','The path to GATK4',"/tmp/gatk4/gatk") vars.Add('javaOptions','Options to run java',"4") vars.Add('maxPopulationAlleleFrequency',"Maximum population allele frequency of sites to consider for GetPileupSummaries in GATK4","0.2") vars.Add('minPopulationAlleleFrequency',"Minimum population allele frequency of sites to consider for GetPileupSummaries in GATK4","0") vars.Add('gnomadVCF',"The gnomad.exomes.rx.x.x.sites.pass.vcf","gnomad.exomes.r2.0.2.sites.vcf.gz") vars.Add('variantCallers',"Variant callers to be used (mutect2,strelka2,varscan)","mutect2,strelka2,varscan") vars.Add('gatk3','The path to gatk3',"/tmp/gatk3/GenomeAnalysisTK.jar") # MuTect2 additional parameters vars.Add('mutect2Parameters',"Parameters for MuTect2","") # VarScan Parameters vars.Add('varScanMinCoverage', 'VarScan minimum read depeh at a position to make a call [10]', "10") vars.Add('varScanPvalue', "VarScan default p-value threshold for calling variants [1e-01]", "0.1") vars.Add('varScanMinVarFreq', "Minimum variant allele frequency threshold [0.20]", "0.2") vars.Add("varScanParameters", "Parameters to be added to varScan2", "") # snpEff Parameters vars.Add("snpeffDir", "SnpEFF directory", "/annotations") vars.Add("snpeffGenomeVersion", "Genome version to be used by snpEff (like GRCh37.75)", "GRCh37.75") # ClinVAr Parameter vars.Add("clinvar", "Clinvar compressed VCF file", "clinvar_20190311.vcf.gz") env = Environment(ENV = os.environ, SHELL = '/bin/bash', variables = vars) env.AppendENVPath('PATH', os.getcwd()) Decider('timestamp-newer') #################### ##### Arguments #### #################### reference = ARGUMENTS.get("reference", env["reference"]) reference = "/annotations/" + reference processors = ARGUMENTS.get("processors", env["processors"]) dbsnpVCF = ARGUMENTS.get("dbsnpVCF",env["dbsnpVCF"]) dbsnpVCF = "/annotations/" + dbsnpVCF cosmic = ARGUMENTS.get("cosmic",env["cosmic"]) cosmic = "/annotations/" + cosmic exomeRegions = ARGUMENTS.get("exomeRegions",env["exomeRegions"]) exomeRegions = "/annotations/" + exomeRegions clinvar = ARGUMENTS.get("clinvar",env["clinvar"]) clinvar = "/annotations/" + clinvar tumor = ARGUMENTS.get("tumor",env["tumor"]) normal = ARGUMENTS.get("normal",env["normal"]) contamination = ARGUMENTS.get("contamination",env["contamination"]) gatk4 = ARGUMENTS.get("gatk4",env["gatk4"]) gatk3 = ARGUMENTS.get("gatk3", env["gatk3"]) javaOptions = ARGUMENTS.get("javaOptions",env["javaOptions"]) javaOptions = "-Xmx"+str(javaOptions)+"g" maxPopulationAlleleFrequency = ARGUMENTS.get("maxPopulationAlleleFrequency",env["maxPopulationAlleleFrequency"]) minPopulationAlleleFrequency = ARGUMENTS.get("minPopulationAlleleFrequency",env["minPopulationAlleleFrequency"]) gnomadVCF = ARGUMENTS.get("gnomadVCF",env["gnomadVCF"]) gnomadVCF = "/annotations/" + gnomadVCF variantCallers = ARGUMENTS.get("variantCallers",env["variantCallers"]) variantCallersList = variantCallers.split(",") # MuTect2 additional parameters mutect2Parameters = ARGUMENTS.get("mutect2Parameters",env["mutect2Parameters"]) # VarScan Parameters varScanMinCoverage = ARGUMENTS.get("varScanMinCoverage", env["varScanMinCoverage"]) varScanPvalue = ARGUMENTS.get("varScanPvalue", env["varScanPvalue"]) varScanMinVarFreq = ARGUMENTS.get("varScanMinVarFreq", env["varScanMinVarFreq"]) varScanParameters = ARGUMENTS.get("varScanParameters", env["varScanParameters"]) # snpEff parameters snpeffGenomeVersion = ARGUMENTS.get("snpeffGenomeVersion", env["snpeffGenomeVersion"]) #snpeffDir = "/annotations/" + snpeffGenomeVersion snpeffDir = "/annotations/" #snpeffDir = ARGUMENTS.get("sneffDir", env["snpeffDir"]) ############################################ #### Calculate contamination with GATK4 #### ############################################ if contamination == "y" or contamination == "yes": getPileupSummariesCMD = "${SOURCES[0]} GetPileupSummaries -I ${SOURCES[1]}/07_recalibrated.bam -L ${SOURCES[2]} -V ${SOURCES[3]} -O Variants/Mutect2/${SOURCES[1]}_${SOURCES[4]}/00_pileups.table" + " -max-af {} -min-af {}".format(maxPopulationAlleleFrequency,minPopulationAlleleFrequency) getPileupSummaries = env.Command(["Variants/Mutect2/{}_{}/00_pileups.table".format(tumor,normal)],[gatk4,tumor,exomeRegions,gnomadVCF,normal],getPileupSummariesCMD) calculateContaminationCMD = "${SOURCES[0]} CalculateContamination -I ${SOURCES[1]} -O $TARGET" calculateContamination = env.Command(["Variants/Mutect2/{}_{}/00_contamination.table".format(tumor,normal)],[gatk4,getPileupSummaries],calculateContaminationCMD) ####################################### ##### Variant Calling with MuTect2 #### ####################################### if "mutect2" in variantCallersList: muTect2CMD = "${SOURCES[0]} Mutect2 -R ${SOURCES[1]} -I ${SOURCES[2]}/07_recalibrated.bam -tumor ${SOURCES[2]} -I ${SOURCES[3]}/07_recalibrated.bam -normal ${SOURCES[3]} -L ${SOURCES[4]} -O $TARGET --germline-resource ${SOURCES[5]}" + " {}".format(mutect2Parameters)+" -bamout Variants/Mutect2/${SOURCES[2]}_${SOURCES[3]}/01_mutect2.bam"+" --native-pair-hmm-threads {}".format(processors) muTect2 = env.Command(["Variants/Mutect2/{}_{}/01_mutect2.vcf".format(tumor,normal)],[gatk4,reference,tumor,normal,exomeRegions,gnomadVCF],muTect2CMD) if (contamination == "y" or contamination == "yes"): filterMutectCallsCMD = "${SOURCES[0]} FilterMutectCalls -V ${SOURCES[1]} -contamination-table ${SOURCES[2]} -O $TARGET" filterMutectCalls = env.Command(["Variants/Mutect2/{}_{}/02_mutect2_filtered.vcf".format(tumor,normal)],[gatk4,muTect2,calculateContamination],filterMutectCallsCMD) else: filterMutectCallsCMD = "${SOURCES[0]} FilterMutectCalls -V ${SOURCES[1]} -O $TARGET" filterMutectCalls = env.Command(["Variants/Mutect2/{}_{}/02_mutect2_filtered.vcf".format(tumor,normal)],[gatk4,muTect2],filterMutectCallsCMD) #Annotation by snpEff mutect2EFFCMD = "java -jar /tmp/snpEff/snpEff.jar eff -v {} -stats Variants/Mutect2/{}_{}/snpEff_summary.html -dataDir {} $SOURCE > $TARGET".format(snpeffGenomeVersion,tumor,normal,snpeffDir) mutect2EFF = env.Command(["Variants/Mutect2/{}_{}/03_mutect2_snpeff.vcf".format(tumor,normal), "Variants/Mutect2/{}_{}/snpEff_summary.genes.txt".format(tumor,normal), "Variants/Mutect2/{}_{}/snpEff_summary.html".format(tumor,normal)],[filterMutectCalls],mutect2EFFCMD) # Annotation with ClinVar mutect2ClinvarCMD = "java -Xmx2g -jar /tmp/snpEff/SnpSift.jar annotate -info CLNSIG,CLNDN {} $SOURCE > $TARGET".format(clinvar) mutect2Clinvar = env.Command(["Variants/Mutect2/{}_{}/04_mutect2_clinvar.vcf".format(tumor,normal)], [mutect2EFF], mutect2ClinvarCMD) # Cosmic coding variants annotation mutect2CosmicCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(cosmic) mutect2Cosmic = env.Command(["Variants/Mutect2/{}_{}/05_mutect2_cosmic.vcf".format(tumor,normal)], [mutect2Clinvar], mutect2CosmicCMD) # dbSNP annotation mutect2DbsnpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(dbsnpVCF) mutect2Dbsnp = env.Command(["Variants/Mutect2/{}_{}/06_mutect2_dbsnp.vcf".format(tumor,normal)], [mutect2Cosmic], mutect2DbsnpCMD) # gnomAD annotation mutect2gnomadCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(gnomadVCF) mutect2gnomad = env.Command(["Variants/Mutect2/{}_{}/07_mutect2_gnomad.vcf".format(tumor,normal)], [mutect2Dbsnp], mutect2gnomadCMD) # Filtering variants mutect2FilteringCMD = "python /tmp/mutect-vcf-selector.py -f $SOURCE -c /annotations/cancer_gene_census.csv -e > $TARGET" mutect2Filtering = env.Command(["Variants/Mutect2/{}_{}/08_mutect2_filtered.vcf".format(tumor,normal)], [mutect2gnomad], mutect2FilteringCMD) # Tag variant type mutect2VariantTypeCMD= "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar varType $SOURCE > $TARGET" mutect2VariantType = env.Command(["Variants/Mutect2/{}_{}/08.5_mutect2_vartype.vcf".format(tumor,normal)], [mutect2Filtering], mutect2VariantTypeCMD) # Selection of snps mutect2snpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar filter \"(VARTYPE has 'SNP') | (VARTYPE has 'MNP')\" $SOURCE > $TARGET" mutect2snp = env.Command(["Variants/Mutect2/{}_{}/09_mutect2_snps.vcf".format(tumor,normal)], [mutect2VariantType], mutect2snpCMD) # Selection of indels mutect2indelsCMD = "java -jar /tmp/snpEff/SnpSift.jar filter \"(VARTYPE has 'INS') | (VARTYPE has 'DEL') | (VARTYPE has 'MIXED')\" $SOURCE > $TARGET" mutect2indels = env.Command(["Variants/Mutect2/{}_{}/09_mutect2_indels.vcf".format(tumor,normal)], [mutect2VariantType], mutect2indelsCMD) # Copy latest created .vcf of snps to VCF directory copyMutect2AnnotatedSnpsCMD = "cp $SOURCE $TARGET" copyMutect2AnnotatedSnps = env.Command(["VCF/{}_{}_mutect2_snps.vcf".format(tumor,normal)], [mutect2snp], copyMutect2AnnotatedSnpsCMD) # Copy latest created .vcf of indels to VCF directory copyMutect2AnnotatedIndelsCMD = "cp $SOURCE $TARGET" copyMutect2AnnotatedIndels = env.Command(["VCF/{}_{}_mutect2_indels.vcf".format(tumor,normal)], [mutect2indels], copyMutect2AnnotatedIndelsCMD) ####################################### #### Variant Calling with Strelka2 #### ####################################### if "strelka2" in variantCallersList: strelka2configurationCMD = "python /tmp/strelka/bin/configureStrelkaSomaticWorkflow.py --normalBam ${SOURCES[0]}/07_recalibrated.bam --tumorBam ${SOURCES[1]}/07_recalibrated.bam --referenceFasta ${SOURCES[2]} --runDir $TARGET --callRegions ${SOURCES[3]}.gz --exome" strelka2configuration = env.Command(["Variants/Strelka2/{}_{}/strelkaWorkflow".format(tumor,normal)],[normal,tumor,reference,exomeRegions],strelka2configurationCMD) strelka2runCMD = "python $SOURCE/runWorkflow.py -m local"+" -j {}".format(processors)+" > $TARGET" strelka2run = env.Command(["Variants/Strelka2/{}_{}/00_strelka2.done.txt".format(tumor,normal)],[strelka2configuration],strelka2runCMD) selectVariantsStrelka2snvsCMD = "${SOURCES[0]}"+" SelectVariants -R ${SOURCES[1]}"+" -V Variants/Strelka2/{}_{}/strelkaWorkflow/results/variants/somatic.snvs.vcf.gz".format(tumor,normal)+" -O $TARGET --exclude-filtered" selectVariantsStrelka2snvs = env.Command(["Variants/Strelka2/{}_{}/01_strelka2_snvs_pass.vcf".format(tumor,normal)],[gatk4,reference,strelka2run],selectVariantsStrelka2snvsCMD) selectVariantsStrelka2indelsCMD = "${SOURCES[0]}"+" SelectVariants -R ${SOURCES[1]}"+" -V Variants/Strelka2/{}_{}/strelkaWorkflow/results/variants/somatic.indels.vcf.gz".format(tumor,normal)+" -O $TARGET --exclude-filtered" selectVariantsStrelka2indels = env.Command(["Variants/Strelka2/{}_{}/02_strelka2_indels_pass.vcf".format(tumor,normal)],[gatk4,reference,selectVariantsStrelka2snvs],selectVariantsStrelka2indelsCMD) #Annotation by snpEff strelka2EFFCMD = "java -jar /tmp/snpEff/snpEff.jar eff -v {} -stats Variants/Strelka2/{}_{}/snpEff_summary.html -dataDir {} $SOURCE > $TARGET".format(snpeffGenomeVersion,tumor,normal,snpeffDir) strelka2EFF = env.Command(["Variants/Strelka2/{}_{}/03_strelka2_snvs_snpeff.vcf".format(tumor,normal), "Variants/Strelka2/{}_{}/snpEff_summary.genes.txt".format(tumor,normal), "Variants/Strelka2/{}_{}/snpEff_summary.html".format(tumor,normal)],[selectVariantsStrelka2snvs], strelka2EFFCMD ) strelka2indelsEFFCMD = "java -jar /tmp/snpEff/snpEff.jar eff -v {} -stats Variants/Strelka2/{}_{}/snpEff_indels_summary.html -dataDir {} $SOURCE > $TARGET".format(snpeffGenomeVersion,tumor,normal,snpeffDir) strelka2indelsEFF = env.Command(["Variants/Strelka2/{}_{}/03_strelka2_indels_snpeff.vcf".format(tumor,normal), "Variants/Strelka2/{}_{}/snpEff_indels_summary.genes.txt".format(tumor,normal), "Variants/Strelka2/{}_{}/snpEff_indels_summary.html".format(tumor,normal)],[selectVariantsStrelka2indels], strelka2indelsEFFCMD ) # Annotation with ClinVar strelka2clinvarCmd = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate -info CLNSIG,CLNDN {} $SOURCE > $TARGET".format(clinvar) strelka2clinvar = env.Command(["Variants/Strelka2/{}_{}/04_strelka2_snvs_clinvar.vcf".format(tumor,normal)], [strelka2EFF], strelka2clinvarCmd) strelka2indelsclinvarCmd = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate -info CLNSIG,CLNDN {} $SOURCE > $TARGET".format(clinvar) strelka2indelsclinvar = env.Command(["Variants/Strelka2/{}_{}/04_strelka2_indels_clinvar.vcf".format(tumor,normal)], [strelka2indelsEFF], strelka2indelsclinvarCmd) # Cosmic coding variants annotation cosmicCodingCmd = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(cosmic) cosmicCoding = env.Command(["Variants/Strelka2/{}_{}/05_strelka2_snvs_cosmic.vcf".format(tumor,normal)], [strelka2EFF], cosmicCodingCmd) cosmicindelsCodingCmd = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(cosmic) cosmicindelsCoding = env.Command(["Variants/Strelka2/{}_{}/05_strelka2_indels_cosmic.vcf".format(tumor,normal)], [strelka2indelsclinvar], cosmicindelsCodingCmd) # dbSNP annotation strelka2DbsnpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(dbsnpVCF) strelka2Dbsnp = env.Command(["Variants/Strelka2/{}_{}/06_strelka2_snvs_dbsnp.vcf".format(tumor,normal)], [cosmicCoding], strelka2DbsnpCMD) strelka2indelsDbsnpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(dbsnpVCF) strelka2indelsDbsnp = env.Command(["Variants/Strelka2/{}_{}/06_strelka2_indels_dbsnp.vcf".format(tumor,normal)], [cosmicindelsCoding], strelka2indelsDbsnpCMD) # snps gnomAD annotation strelka2gnomadCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(gnomadVCF) strelka2gnomad = env.Command(["Variants/Strelka2/{}_{}/07_strelka2_snvs_gnomad.vcf".format(tumor,normal)], [strelka2Dbsnp], strelka2gnomadCMD) strelka2GenotypeConversionCMD = "update_GT_Strelka2.py -v $SOURCE -t snps > $TARGET" strelka2GenotypeConversion = env.Command(["Variants/Strelka2/{}_{}/08_strelka2_snvs_gt.vcf".format(tumor,normal)], [strelka2gnomad], strelka2GenotypeConversionCMD) # Indels gnomad annotation strelka2indelsgnomadCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(gnomadVCF) strelka2indelsgnomad = env.Command(["Variants/Strelka2/{}_{}/07_strelka2_indels_gnomad.vcf".format(tumor,normal)], [strelka2indelsDbsnp], strelka2indelsgnomadCMD) strelka2GenotypeConversionIndelsCMD = "update_GT_Strelka2.py -v $SOURCE -t indels > $TARGET" strelka2GenotypeConversionIndels = env.Command(["Variants/Strelka2/{}_{}/08_strelka2_indels_gt.vcf".format(tumor,normal)], [strelka2indelsgnomad], strelka2GenotypeConversionIndelsCMD) # Move latest created vcf of snps to VCF directory copyStrelka2SnpsAnnotatedCMD = "cp $SOURCE $TARGET" copyStrelka2SnpsAnnotated = env.Command(["VCF/{}_{}_strelka2_snps.vcf".format(tumor,normal)], [strelka2GenotypeConversion], copyStrelka2SnpsAnnotatedCMD) # Move latest created vcf of indels to VCF directory copyStrelka2IndelsAnnotatedCMD = "cp $SOURCE $TARGET" copyStrelka2IndelsAnnotated = env.Command(["VCF/{}_{}_strelka2_indels.vcf".format(tumor,normal)], [strelka2GenotypeConversionIndels], copyStrelka2IndelsAnnotatedCMD) ###################################### #### Variant calling with VarScan #### ###################################### if "varscan" in variantCallersList: varscanCMD = 'java -jar /tmp/VarScan.jar somatic ' \ ' <(samtools mpileup -f ${SOURCES[0]} /working/${SOURCES[1]}/07_recalibrated.bam) ' \ ' <(samtools mpileup -f ${SOURCES[0]} /working/${SOURCES[2]}/07_recalibrated.bam) ' + \ ' Variants/VarScan/{}_{}/01_varscan ' \ ' --min-coverage 10 ' \ ' --somatic-p-value 0.05 ' \ ' --p-value 0.99 ' \ ' --min-var-freq 0.02 {}'.format(tumor,normal,varScanParameters) varscan = env.Command(["Variants/VarScan/{}_{}/01_varscan.indel".format(tumor,normal), \ "Variants/VarScan/{}_{}/01_varscan.snp".format(tumor,normal)], \ [reference,normal,tumor], \ varscanCMD) filterVarScanSnpsCMD = 'java -jar /tmp/VarScan.jar somaticFilter ' \ 'Variants/VarScan/{}_{}/01_varscan.snp ' \ ' --min-coverage {} ' \ ' --pvalue {} ' \ ' --min-var-freq {} ' \ ' --output-file Variants/VarScan/{}_{}/02_snp ' \ ' && cp Variants/VarScan/{}_{}/02_snp '.format(tumor,normal,varScanMinCoverage,varScanPvalue,varScanMinVarFreq,tumor,normal,tumor,normal) + \ '${TARGET}' + ' && rm Variants/VarScan/{}_{}/02_snp'.format(tumor,normal) filterVarScanSnps = env.Command(["Variants/VarScan/{}_{}/02_varscan_snp_filtered.snp".format(tumor,normal)], \ [reference,normal,tumor,varscan], \ filterVarScanSnpsCMD) filterVarScanIndelCMD = 'java -jar /tmp/VarScan.jar somaticFilter ' \ 'Variants/VarScan/{}_{}/01_varscan.indel ' \ ' --min-coverage {} ' \ ' --pvalue {} ' \ ' --min-var-freq {} ' \ ' --output-file Variants/VarScan/{}_{}/02_indel && cp Variants/VarScan/{}_{}/02_indel '.format(tumor,normal,varScanMinCoverage,varScanPvalue,varScanMinVarFreq,tumor,normal,tumor,normal) + \ '${TARGET}' + \ ' && rm Variants/VarScan/{}_{}/02_indel'.format(tumor,normal) filterVarScanIndel = env.Command(["Variants/VarScan/{}_{}/02_varscan_indel_filtered.snp".format(tumor,normal)], [reference,normal,tumor,varscan], filterVarScanIndelCMD) snpsToVCFCMD = 'python /tmp/vs_format_converter.py ' \ 'Variants/VarScan/{}_{}/02_varscan_snp_filtered.snp > $TARGET'.format(tumor,normal) snpsToVCF = env.Command(["Variants/VarScan/{}_{}/03_varscan_snp_filtered.vcf".format(tumor,normal)], [filterVarScanSnps], snpsToVCFCMD) snpsToIndelsCMD = 'python /tmp/vs_format_converter.py ' \ 'Variants/VarScan/{}_{}/02_varscan_indel_filtered.snp ' \ '> $TARGET'.format(tumor,normal) snpsToIndels = env.Command(["Variants/VarScan/{}_{}/03_varscan_indels_filtered.vcf".format(tumor,normal)], [filterVarScanIndel], snpsToIndelsCMD) #Annotation by snpEff varscansnpEFFCMD = "java -jar /tmp/snpEff/snpEff.jar eff -v {} -stats Variants/VarScan/{}_{}/snpEff_summary.html -dataDir {} $SOURCE > $TARGET".format(snpeffGenomeVersion,tumor,normal,snpeffDir) varscansnpEFF = env.Command(["Variants/VarScan/{}_{}/04_varscan_snp_snpeff.vcf".format(tumor,normal), "Variants/VarScan/{}_{}/snpEff_summary.genes.txt".format(tumor,normal), "Variants/VarScan/{}_{}/snpEff_summary.html".format(tumor,normal)],[snpsToVCF],varscansnpEFFCMD) varscanindelsEFFCMD = "java -jar /tmp/snpEff/snpEff.jar eff -v {} -stats Variants/VarScan/{}_{}/snpEff_indels_summary.html -dataDir {} $SOURCE > $TARGET".format(snpeffGenomeVersion,tumor,normal,snpeffDir) varscanindelsEFF = env.Command(["Variants/VarScan/{}_{}/04_varscan_indels_snpeff.vcf".format(tumor,normal), "Variants/VarScan/{}_{}/snpEff_indels_summary.genes.txt".format(tumor,normal), "Variants/VarScan/{}_{}/snpEff_indels_summary.html".format(tumor,normal)],[snpsToIndels],varscanindelsEFFCMD) # Annotation with ClinVar varscanclinvarCMD = "java -Xmx2g -jar /tmp/snpEff/SnpSift.jar annotate -info CLNSIG,CLNDN {} $SOURCE > $TARGET".format(clinvar) varscanclinvar = env.Command(["Variants/VarScan/{}_{}/05_varscan_snp_clinvar.vcf".format(tumor,normal)], [varscansnpEFF], varscanclinvarCMD) varscanindelsclinvarCMD = "java -Xmx2g -jar /tmp/snpEff/SnpSift.jar annotate -info CLNSIG,CLNDN {} $SOURCE > $TARGET".format(clinvar) varscanindelsclinvar = env.Command(["Variants/VarScan/{}_{}/05_varscan_indels_clinvar.vcf".format(tumor,normal)], [varscanindelsEFF], varscanindelsclinvarCMD) # Cosmic coding variants annotation varscanCosmicCodingCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(cosmic) varscanCosmicCoding = env.Command(["Variants/VarScan/{}_{}/06_varscan_snp_cosmic.vcf".format(tumor,normal)], [varscanclinvar], varscanCosmicCodingCMD) varscanindelsCosmicCodingCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(cosmic) varscanindelsCosmicCoding = env.Command(["Variants/VarScan/{}_{}/06_varscan_indels_cosmic.vcf".format(tumor,normal)], [varscanindelsclinvar], varscanindelsCosmicCodingCMD) # dbSNP annotation varscanDbsnpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(dbsnpVCF) varscanDbsnp = env.Command(["Variants/VarScan/{}_{}/07_varscan_snp_dbsnp.vcf".format(tumor,normal)], [varscanCosmicCoding], varscanDbsnpCMD) varscanindelsDbsnpCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(dbsnpVCF) varscanindelsDbsnp = env.Command(["Variants/VarScan/{}_{}/07_varscan_indels_dbsnp.vcf".format(tumor,normal)], [varscanindelsCosmicCoding], varscanindelsDbsnpCMD) # gnomAD annotation varscangnomadCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(gnomadVCF) varscangnomad = env.Command(["Variants/VarScan/{}_{}/08_varscan_snp_gnomad.vcf".format(tumor,normal)], [varscanDbsnp], varscangnomadCMD) varscanindelsgnomadCMD = "java -Xmx4g -jar /tmp/snpEff/SnpSift.jar annotate {} $SOURCE > $TARGET".format(gnomadVCF) varscanindelsgnomad = env.Command(["Variants/VarScan/{}_{}/08_varscan_indels_gnomad.vcf".format(tumor,normal)], [varscanindelsDbsnp], varscanindelsgnomadCMD) # Move latest created vcf of snps to VCF directory copyVarscanAnnotatedCMD = "cp $SOURCE $TARGET" copyVarscanAnnotated = env.Command(["VCF/{}_{}_varscan_snps.vcf".format(tumor,normal)], [varscangnomad], copyVarscanAnnotatedCMD) # Move latest created vcf of indels to VCF directory copyVarscanAnnotatedIndelsCMD = "cp $SOURCE $TARGET" copyVarscanAnnotatedIndels = env.Command(["VCF/{}_{}_varscan_indels.vcf".format(tumor,normal)], [varscanindelsgnomad], copyVarscanAnnotatedIndelsCMD) # =========================================================================================== variantCallersString = "java -Xmx4g -jar /tmp/gatk3/GenomeAnalysisTK.jar -T CombineVariants -R {} ".format(reference) variantCallersIndelsString = variantCallersString lastStepOfVariantCallerList = [] lastStepOfIndelCallerList = [] lastStepOfVariantCaller = {} lastStepOfIndelVariantCaller = {} for variantCaller in variantCallersList: if variantCaller == "varscan": lastStepOfVariantCaller["varscan"] = copyVarscanAnnotated[0] lastStepOfIndelVariantCaller["varscan"] = copyVarscanAnnotatedIndels[0] if variantCaller == "strelka2": lastStepOfVariantCaller["strelka2"] = copyStrelka2SnpsAnnotated[0] lastStepOfIndelVariantCaller["strelka2"] = copyStrelka2IndelsAnnotated[0] if variantCaller == "mutect2": lastStepOfVariantCaller["mutect2"] = copyMutect2AnnotatedSnps[0] lastStepOfIndelVariantCaller["mutect2"] = copyMutect2AnnotatedIndels[0] for variantCaller in variantCallersList: variantCallersString = variantCallersString + " --variant:{} /working/VCF/{}_{}_{}_snps.vcf ".format(variantCaller,tumor,normal,variantCaller) lastStepOfVariantCallerList.append(lastStepOfVariantCaller.get(variantCaller)) variantCallersIndelsString = variantCallersIndelsString + " --variant:{} /working/VCF/{}_{}_{}_indels.vcf ".format(variantCaller,tumor,normal,variantCaller) lastStepOfIndelCallerList.append(lastStepOfIndelVariantCaller.get(variantCaller)) variantCallersList.sort() lastStepOfIndelCallerList.sort() variantCallerListIndels = variantCallersList[0:] variantCallersStringCMD = variantCallersString + " -o VCF/Combined_VCFs_by_sample/{}_{}_merged_snps.vcf -genotypeMergeOptions PRIORITIZE -priority {}".format(tumor,normal, ",".join(variantCallersList)) combineVCFsSnps = env.Command(["VCF/Combined_VCFs_by_sample/{}_{}_merged_snps.vcf".format(tumor,normal)], lastStepOfVariantCallerList, variantCallersStringCMD) variantCallersIndelsString = variantCallersIndelsString + " -o VCF/Combined_VCFs_by_sample/{}_{}_merged_indels.vcf -genotypeMergeOptions PRIORITIZE -priority {}".format(tumor,normal,",".join(variantCallerListIndels)) combineVCFsIndels = env.Command(["VCF/Combined_VCFs_by_sample/{}_{}_merged_indels.vcf".format(tumor,normal)], lastStepOfIndelCallerList, variantCallersIndelsString )