---
# Configuration file defining biological data to retrieve and install.
# These are stored in an Amazon S3 bucket:
#   https://s3.amazonaws.com/biodata
# and retrieved using the data_fabfile Fabric script.
#
## Genome data -- Next generation sequencing and Galaxy
# Details about the genomes you want to include.
# Required genome fields (corresponding to Galaxy's tool_data_table_conf.xml
# columns):
#     dbkey - globally unique identifier for the genome (e.g., hg19)
#     name - descriptive name for the given genome (to be displayed in Galaxy,
#            e.g., Hsapiens)
# Optional genome fields (corresponding to Galaxy's tool_data_table_conf.xml
# columns):
#     formats, species, dbkey1, dbkey2, value, path, index
# Additional genome fields specific to data deployment:
#     genome_indexes - list of tool indexes specific to the associated genome
#                      (overrides global 'genome_indexes')
# NOTE(review): every entry below spells the per-genome override `indexes`,
# not `genome_indexes` -- confirm which key the loader actually reads.
# Entries below also carry `annotations`, `annotations_available` and
# `validation` lists, which are not described in this header.
genomes:
  - dbkey: phix
    name: phiX174
  - dbkey: hg19
    name: Human (hg19)
    indexes: [seq, twobit]
    annotations: [
      GA4GH_problem_regions, capture_regions, MIG, prioritize, dbsnp, hapmap,
      1000g_omni_snps, 1000g_snps, mills_indels, cosmic, ancestral, clinvar,
      qsignature, ACMG56_genes, transcripts, RADAR, mirbase, genesplicer,
      effects_transcripts, vcfanno, viral]
    annotations_available: [battenberg, dbnsfp, dbscsnv]
    validation: [
      giab-NA12878, platinum-genome-NA12878, giab-NA24385, giab-NA24631]
  - dbkey: GRCh37
    name: Human (GRCh37)
    indexes: [seq, twobit]
    annotations: [
      GA4GH_problem_regions, capture_regions, MIG, prioritize, dbsnp, hapmap,
      1000g_omni_snps, 1000g_snps, mills_indels, cosmic, ancestral, clinvar,
      qsignature, ACMG56_genes, transcripts, RADAR, mirbase, genesplicer,
      effects_transcripts, vcfanno, viral]
    annotations_available: [battenberg, dbnsfp, dbscsnv]
    validation: [
      giab-NA12878, giab-NA24385, giab-NA24631, dream-syn3, dream-syn4]
  - dbkey: hg38
    name: Human (hg38) full
    indexes: [seq, twobit, bwa, hisat2]
    annotations: [
      coverage, prioritize, dbsnp, hapmap_snps, 1000g_omni_snps, 1000g_snps,
      1000g_indels, mills_indels, clinvar, qsignature, ACMG56_genes,
      transcripts, genesplicer, effects_transcripts, vcfanno, esp, exac, viral]
    annotations_available: [dbnsfp, dbscsnv]
    validation: [
      giab-NA12878, giab-NA24385, giab-NA24631, platinum-genome-NA12878,
      giab-NA12878-remap, giab-NA12878-crossmap, dream-syn4-crossmap,
      dream-syn3-crossmap]
  - dbkey: hg38-noalt
    name: Human (hg38) without alternative alleles
    annotations: [
      coverage, dbsnp, hapmap_snps, 1000g_omni_snps, 1000g_snps, 1000g_indels,
      mills_indels, clinvar, transcripts]
    annotations_available: [dbnsfp, dbscsnv]
  - dbkey: mm9
    name: Mouse (mm9)
  - dbkey: mm10
    name: Mouse (mm10)
    annotations: [problem_regions, dbsnp, transcripts, mirbase]
  - dbkey: rn5
    name: Rat (rn5)
  - dbkey: rn6
    name: Rat (rn6)
    annotations: [transcripts, mirbase]
  - dbkey: canFam3
    name: Dog (canFam3)
    annotations: [dbsnp, transcripts]
  - dbkey: galGal4
    name: Chicken (galGal4)
  - dbkey: dm3
    # Spelling corrected from "melangogaster"; this is the display name only.
    name: D melanogaster (dm3)
  - dbkey: TAIR10
    name: Arabidopsis thaliana (TAIR10)
    annotations: [mirbase]
  - dbkey: xenTro3
    name: X tropicalis (xenTro3)
  - dbkey: GRCz10
    name: Zebrafish (GRCz10)
  - dbkey: Zv9
    name: Zebrafish (Zv9)
  - dbkey: sacCer3
    name: S cerevisiae (sacCer3)
  - dbkey: WBcel235
    name: C elegans (WBcel235)
  - dbkey: pseudomonas_aeruginosa_ucbpp_pa14
    name: Pseudomonas aeruginosa UCBPP-PA14

# High level targets for specifying annotations
annotation_groups:
  variation: [
    problem_regions, GA4GH_problem_regions, capture_regions, MIG, coverage,
    prioritize, dbsnp, hapmap, hapmap_snps, 1000g_omni_snps, ACMG56_genes,
    1000g_snps, mills_indels, 1000g_indels, clinvar, cosmic, ancestral,
    qsignature, genesplicer, effects_transcripts, vcfanno, viral]
  rnaseq: [transcripts, RADAR]
  smallrna: [mirbase]
  gemini: [esp, exac]

# Global set of indexes to include for each genome.
# Available choices are in GENOMES_INDEXES_SUPPORTED in
# cloudbio/biodata/genomes.py
genome_indexes:
  - bwa
  - twobit

# Additional data targets
install_liftover: false
install_uniref: false