@prefix dcterms: . @prefix ns1: . @prefix xsd: . <#ont____assembly_flye_ahrenslab-inputs-ftp://biftp.informatik.uni-freiburg.de/pub/T0/Ahrens/SRR6982805.fastq> a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ftp://biftp.informatik.uni-freiburg.de/pub/T0/Ahrens/SRR6982805.fastq" . <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8811145> a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "https://ndownloader.figshare.com/files/8811145" . <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8811148> a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "https://ndownloader.figshare.com/files/8811148" . <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8812159> a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "https://ndownloader.figshare.com/files/8812159" . a ns1:Person ; ns1:name "Fernando Vázquez-Novoa" . a ns1:Person ; ns1:name "Sebastian Ares de Parga Regalado" . a ns1:Person ; ns1:name "Adam M. Novak" . a ns1:Person ; ns1:name "Delphine Lariviere" . a ns1:Person ; ns1:name "Fani Hatjina" . a ns1:Person ; ns1:name "Justin Lemkul" . a ns1:Person ; ns1:name "Kamshat Temirbayeva" . a ns1:Person ; ns1:name "Slađan Rašić" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:Person ; ns1:name "Viktoria Isabel Schwarz" . a ns1:Person ; ns1:name "Christoph Steinbeck" . a ns1:Person ; ns1:name "Nina Van Goethem" . a ns1:Person ; ns1:name "Maria Bouga" . a ns1:Person ; ns1:name "Konstantinos Kyritsis" . a ns1:Person ; ns1:name "Bruno P. Kinoshita" . a ns1:Person ; ns1:name "Maria Sorokina" . a ns1:Person ; ns1:name "Zlatko Puškadija" . a ns1:Person ; ns1:name "M. Alice Pinto" . a ns1:Person ; ns1:name "Andrea Guarracino" . a ns1:Person ; ns1:name "Johannes Köster" . a ns1:Person ; ns1:name "Bugra Oezdemir" . a ns1:Person ; ns1:name "Santiago Royo-Sierra" . a ns1:Person ; ns1:name "Nandan Deshpande" . a ns1:Person ; ns1:name "Andrzej Oleksa" . a ns1:Person ; ns1:name "Pedro João Rodrigues" . 
a ns1:Person ; ns1:name "Franck Dedeine" . a ns1:Person ; ns1:name "Enrique Bernal-Delgado" . a ns1:Person ; ns1:name "Leonidas Charistos" . a ns1:Person ; ns1:name "Max Schubach" . a ns1:Person ; ns1:name "Hans-Rudolf Hotz" . a ns1:Person ; ns1:name "Irfan Kandemir" . a ns1:Person ; ns1:name "Claudio Satriano" . a ns1:Person ; ns1:name "Aibyn Torekhanov" . a ns1:Person ; ns1:name "Luc Cornet" . a ns1:Person ; ns1:name "Vincent Hervé" . a ns1:Person ; ns1:name "Mridul Johari" . a ns1:Person ; ns1:name "Marin Kovačić" . a ns1:Person ; ns1:name "Mehmet Tekman" . a ns1:Person ; ns1:name "Kristian Peters" . a ns1:Person ; ns1:name "Marjan Meurisse" . a ns1:Person ; ns1:name "Zhanar Sheralieva" . a ns1:Person ; ns1:name "Jose Raul Bravo Martinez" . a ns1:Person ; ns1:name "Eliza Căuia" . a ns1:Person ; ns1:name "Bonface Munyoki" . a ns1:Person ; ns1:name "Szilvia Kusza" . a ns1:Person ; ns1:name "Pierre Osteil" . a ns1:Person ; ns1:name "Francisco Estupiñan-Romero" . a ns1:Person ; ns1:name "Natalia Poiata" . a ns1:Person ; ns1:name "Daniel Blankenberg" . a ns1:Person ; ns1:name "Ulzhan Nuralieva" . a ns1:Person ; ns1:name "Isaure Chauvot de Beauchêne" . a ns1:Person ; ns1:name "Daphne Wijnbergen" . a ns1:Person ; ns1:name "Denys Savchenko" . a ns1:Person ; ns1:name "Pjotr Prins" . a ns1:Person ; ns1:name "Javier González Galindo" . a ns1:Person ; ns1:name "Michael Heuer" . a ns1:Person ; ns1:name "Laurent Gil" . a ns1:Person ; ns1:name "Natalia Martínez-Lizaga" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Riccardo Rossi Bernecoli" . a ns1:Person ; ns1:name "Adrian Siceanu" . a ns1:Person ; ns1:name "Diego Carvalho" . a ns1:Person ; ns1:name "Nachida Tadrent" . a ns1:Person ; ns1:name "Janez Prešern" . a ns1:Person ; ns1:name "Leandro Liborio" . a ns1:Person ; ns1:name "Damon-Lee Pointon" . a ns1:Person ; ns1:name "Thomas Liener" . a ns1:Person ; ns1:name "Marina Kennerson" . a ns1:Person ; ns1:name "Peter Amstutz" . 
a ns1:Person ; ns1:name "Jorge Ejarque" . a ns1:Person ; ns1:name "Ying Sims" . a ns1:Person ; ns1:name "Will Eagles" . a ns1:Collection ; ns1:name "Sanger Tree of Life Assembly method" . a ns1:Collection ; ns1:name "ERGA Assembly Snakemake HiFi & HiC Pipelines" . a ns1:Collection ; ns1:name "Genome Assembly Workflows for ERGA-BGE genomes" . a ns1:Collection ; ns1:name "IDR searcher" . a ns1:Person ; ns1:name "Sarah Beecroft" . a ns1:Person ; ns1:name "Marco De La Pierre" . a ns1:Person ; ns1:name "Julian Uszkoreit" . a ns1:Person ; ns1:name "Camille Juigné" . a ns1:Person ; ns1:name "Richard Lupat" . a ns1:Person ; ns1:name "Johannes Köster" . a ns1:Person ; ns1:name "Valentin Tilloy" . a ns1:Person ; ns1:name "Michael Roach" . a ns1:Person ; ns1:name "Elida Schneltzer" . a ns1:Person ; ns1:name "Stian Soiland-Reyes" . a ns1:Person ; ns1:name "Luiz Gadelha" . a ns1:Person ; ns1:name "Asier Gonzalez-Uriarte" . a ns1:Person ; ns1:name "Woosub Shin" . a ns1:Person ; ns1:name "Ryan Patterson-Cross" . a ns1:Person ; ns1:name "Anthony Bretaudeau" . a ns1:Person ; ns1:name "Haris Zafeiropoulos" . a ns1:Person ; ns1:name "Hrishikesh Dhondge" . a ns1:Person ; ns1:name "Kim Philipp Jablonski" . a ns1:Person ; ns1:name "Pasi Korhonen" . a ns1:Person ; ns1:name "Marvin Martens" . a ns1:Person ; ns1:name "Tazro Inutano" . a ns1:Person ; ns1:name "Marlene Rezk" . a ns1:Person ; ns1:name "Casper de Visser" . a ns1:Person ; ns1:name "Jasper Ouwerkerk" . a ns1:Person ; ns1:name "Benjamin Wingfield" . a ns1:Person ; ns1:name "Anna Niehues" . a ns1:Person ; ns1:name "Marie-Dominique Devignes" . a ns1:Person ; ns1:name "Samuel Lambert" . a ns1:Person ; ns1:name "Alejandra Escobar" . a ns1:Person ; ns1:name "Tatiana Gurbich" . a ns1:Person ; ns1:name "Katherine Farquharson" . a ns1:Person ; ns1:name "Lucas Cruz" . a ns1:Person ; ns1:name "Pablo Riesgo Ferreiro" . a ns1:Person ; ns1:name "Felicita Gernhardt" . a ns1:Person ; ns1:name "Andrey Prjibelski" . 
a ns1:Person ; ns1:name "Xiaokang Zhang" . a ns1:Person ; ns1:name "Zavolan Lab" . a ns1:Person ; ns1:name "Mahnoor Zulfiqar" . a ns1:Person ; ns1:name "Veit Schwämmle" . a ns1:Person ; ns1:name "Anton Korobeynikov" . a ns1:Person ; ns1:name "Tom Brown" . a ns1:Person ; ns1:name "Luiz Gadelha" . a ns1:Person ; ns1:name "Vasiliki Panagi" . a ns1:Person ; ns1:name "Willem de Koning" . a ns1:Person ; ns1:name "Jorrit Mesman" . a ns1:Person ; ns1:name "Elisabetta Spinazzola" . a ns1:Person ; ns1:name "Cristian Tatu" . a ns1:Person ; ns1:name "Agata Kilar" . a ns1:Person ; ns1:name "Jessica Gomez-Garrido" . a ns1:Person ; ns1:name "Sagane Joye-Dind" . a ns1:Person ; ns1:name "Pau Andrio" . a ns1:Person ; ns1:name "Akshay Akshay" . a ns1:Person ; ns1:name "Phuong Doan" . a ns1:Person ; ns1:name "Davide Gurnari" . a ns1:Person ; ns1:name "Johan Gustafsson" . a ns1:Person ; ns1:name "David Yuan" . a ns1:Person ; ns1:name "Justin Sonneck" . a ns1:Person ; ns1:name "Zafran Hussain Shah" . a ns1:Person ; ns1:name "Fernando Vázquez-Novoa" . a ns1:Person ; ns1:name "Nandan Deshpande" . a ns1:Person ; ns1:name "Michael Hall" . a ns1:Person ; ns1:name "Rafael Terra" . a ns1:Person ; ns1:name "Peter van Heusden" . a ns1:Person ; ns1:name "Ian Brennan" . a ns1:Person ; ns1:name "Bryan Raubenolt" . a ns1:Person ; ns1:name "Matthias Haimel" . a ns1:Person ; ns1:name "Carlos Oscar Sorzano Sanchez" . a ns1:Person ; ns1:name "Paul Brack" . a ns1:Person ; ns1:name "Wendi Bacon" . a ns1:Person ; ns1:name "Romane Libouban" . a ns1:Organization, ns1:Project ; ns1:name "CAPSID" . a ns1:Organization, ns1:Project ; ns1:name "KircherLab" . a ns1:Organization, ns1:Project ; ns1:name "BCCM_ULC" . a ns1:Organization, ns1:Project ; ns1:name "Computational Science at HZDR" . a ns1:Organization, ns1:Project ; ns1:name "Big data in biomedicine" . a ns1:Organization, ns1:Project ; ns1:name "TRE-FX" . a ns1:Organization, ns1:Project ; ns1:name "NMR Workflow" . 
a ns1:Organization, ns1:Project ; ns1:name "Guigó lab" . a ns1:Organization, ns1:Project ; ns1:name "Statistical genetics" . a ns1:Organization, ns1:Project ; ns1:name "Delineating Regions-of-interest for Mass Spectrometry Imaging by Multimodally Corroborated Spatial Segmentation" . a ns1:Organization, ns1:Project ; ns1:name "Bioinformatics Unit @ CRG" . a ns1:Organization, ns1:Project ; ns1:name "BSC-CES" . a ns1:Organization, ns1:Project ; ns1:name "ELIXIR Proteomics" . a ns1:Organization, ns1:Project ; ns1:name "IRRI Bioinformatics Group" . a ns1:Organization, ns1:Project ; ns1:name "Zavolan Lab" . a ns1:Organization, ns1:Project ; ns1:name "Metabolomics-Reproducibility" . a ns1:Organization, ns1:Project ; ns1:name "NGFF Tools" . a ns1:Organization, ns1:Project ; ns1:name "Bioinformatics workflows for life science" . a ns1:Organization, ns1:Project ; ns1:name "Helmholtz Scientific Project Workflow Platform" . a ns1:Organization, ns1:Project ; ns1:name "BY-COVID Baseline Use Case: SARS-CoV-2 Vaccine(s) effectiveness in preventing SARS-CoV-2 infection" . a ns1:Organization, ns1:Project ; ns1:name "Lake Erken modelling setup" . a ns1:Organization, ns1:Project ; ns1:name "EOSC-Life WP3 OC Team, cross RI project" . a ns1:Organization, ns1:Project ; ns1:name "ARA-dev" . a ns1:Organization, ns1:Project ; ns1:name "Mendel Centre for Plant Genomics and Proteomics" . a ns1:Organization, ns1:Project ; ns1:name "Metagenomic tools" . a ns1:Organization, ns1:Project ; ns1:name "Pillar I: Manufacturing" . a ns1:Organization, ns1:Project ; ns1:name "Polygenic Score Catalog" . a ns1:Organization, ns1:Project ; ns1:name "MLme: Machine Learning Made Easy" . a ns1:Organization, ns1:Project ; ns1:name "Dioscuri TDA" . a ns1:Organization, ns1:Project ; ns1:name "MMV-Lab" . a ns1:Organization, ns1:Project ; ns1:name "Tree of Life Genome Assembly" . a ns1:Organization, ns1:Project ; ns1:name "EBP-Nor" . 
a ns1:Organization, ns1:Project ; ns1:name "Evaluation of Swin Transformer and knowledge transfer for denoising of super-resolution structured illumination microscopy data" . a ns1:Organization, ns1:Project ; ns1:name "COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource" . a ns1:Organization, ns1:Project ; ns1:name "Bioinformatics Laboratory for Genomics and Biodiversity (LBGB)" . a ns1:Organization, ns1:Project ; ns1:name "Pangenome database project" . a ns1:Organization, ns1:Project ; ns1:name "HP2NET - Framework for construction of phylogenetic networks on High Performance Computing (HPC) environment" . a ns1:Organization, ns1:Project ; ns1:name "Generalized Open-Source Workflows for Atomistic Molecular Dynamics Simulations of Viral Helicases" . a ns1:Organization, ns1:Project ; ns1:name "Italy-Covid-data-Portal" . a ns1:Organization, ns1:Project ; ns1:name "EOSC-Life-WP6-Demos" . a ns1:Organization, ns1:Project ; ns1:name "EOSC-Life WP3" . a ns1:Organization, ns1:Project ; ns1:name "MOLGENIS" . a ns1:Organization, ns1:Project ; ns1:name "Janis" . a ns1:Organization, ns1:Project ; ns1:name "IAA-CSIC" . a ns1:Organization, ns1:Project ; ns1:name "ODA" . a ns1:Organization, ns1:Project ; ns1:name "FAME" . a ns1:Organization, ns1:Project ; ns1:name "CHU Limoges - UF9481 Bioinformatique / CNR Herpesvirus" . a ns1:Organization, ns1:Project ; ns1:name "HecatombDevelopment" . a ns1:Organization, ns1:Project ; ns1:name "OpenEBench" . a ns1:Organization, ns1:Project ; ns1:name "Bioinformatics and Biostatistics (BIO2 ) Core" . a ns1:Organization, ns1:Project ; ns1:name "VIB Bioinformatics Core" . a ns1:Organization, ns1:Project ; ns1:name "Single Cell Unit" . a ns1:Organization, ns1:Project ; ns1:name "emo-bon" . a ns1:Organization, ns1:Project ; ns1:name "CINECA" . a ns1:Organization, ns1:Project ; ns1:name "Toxicology community" . a ns1:Organization, ns1:Project ; ns1:name "Gyn Department" . 
a ns1:Organization, ns1:Project ; ns1:name "Medizinisches Proteom-Center, Medical Bioinformatics" . a ns1:Organization, ns1:Project ; ns1:name "AGRF BIO" . a ns1:Organization, ns1:Project ; ns1:name "X-omics" . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-02-04T09:05:08Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "Non-functional workflow to get a global view of possibilities for plant virus classification." ; ns1:keywords "Virus, identification, exploration" ; ns1:license ; ns1:name "0: View complete virus identification" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Data_R1.fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Data_R2.fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_report" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_taxonomy_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_taxonomy_table_modified" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Taxonomic_prediction_report" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-02-04T09:07:38Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "Metagenomic dataset taxonomic classification using kraken2" ; ns1:input , ; ns1:keywords "Virology, kraken" ; ns1:license ; ns1:name "1: Plant virus detection with kraken2 (PE)" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Data_R1.fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Data_R2.fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired-end forward" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired-end reverse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read_mapping_alignement" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blastn result" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapped_read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_fasta" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-02-04T09:10:04Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "Mapping against all plant virus then make contig out of the mapped reads then blast them." 
; ns1:input , ; ns1:keywords "Virology, mapping, Assembly, reads_selection, blast" ; ns1:license ; ns1:name "2: Plant virus confirmation" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Shovill on input dataset(s) Log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Shovill on input dataset(s): Contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp on input dataset(s): HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp on input dataset(s): Read 1 output" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-02-04T09:11:37Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "Just the cleaning then assembly of all reads. TO explore further follow one of the paths described in \"Global view\" (WF 0) " ; ns1:keywords "Virology, exploration, DE_NOVO" ; ns1:license ; ns1:name "3: Plant virus exploration" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Alexey Sokolov" . a ns1:Person ; ns1:name "David F. Nieuwenhuijse" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-02-12T11:46:24Z"^^ns1:Date ; ns1:dateModified "2023-11-24T10:02:56Z"^^ns1:Date ; ns1:description "A workflow for mapping and consensus generation of SARS-CoV2 whole genome amplicon nanopore data implemented in the Nextflow framework. Reads are mapped to a reference genome using Minimap2 after trimming the amplicon primers with a fixed length at both ends of the amplicons using Cutadapt. 
The consensus is called using Pysam based on a majority read support threshold per position of the Minimap2 alignment and positions with less than 30x coverage are masked using ‘N’." ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "ENA SARS-CoV-2 Nanopore Amplicon Sequencing Analysis Workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Krisztian Papp" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-02-15T09:19:44Z"^^ns1:Date ; ns1:dateModified "2023-11-24T10:02:53Z"^^ns1:Date ; ns1:description "A pipeline for mapping, calling, and annotation of SARS-CoV2 variants." ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "ENA SARS-CoV2 Variant Calling" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-05-21T08:45:49Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:47:03Z"^^ns1:Date ; ns1:description """Rare disease researchers workflow is that they submit their raw data (fastq), run the mapping and variant calling RD-Connect pipeline and obtain unannotated gvcf files to further submit to the RD-Connect GPAP or analyse on their own. This demonstrator focuses on the variant calling pipeline. The raw genomic data is processed using the RD-Connect pipeline ([Laurie et al., 2016](https://www.ncbi.nlm.nih.gov/pubmed/27604516)) running on the standards (GA4GH) compliant, interoperable container orchestration platform. 
This demonstrator will be aligned with the current implementation study on [Development of Architecture for Software Containers at ELIXIR and its use by EXCELERATE use-case communities](docs/Appendix%201%20-%20Project%20Plan%202018-biocontainers%2020171117.pdf) For this implementation, different steps are required: 1. Adapt the pipeline to CWL and dockerise elements 2. Align with IS efforts on software containers to package the different components (Nextflow) 3. Submit trio of Illumina NA12878 Platinum Genome or Exome to the GA4GH platform cloud (by Aspera or ftp server) 4. Run the RD-Connect pipeline on the container platform 5. Return corresponding gvcf files 6. OPTIONAL: annotate and update to RD-Connect playground instance N.B: The demonstrator might have some manual steps, which will not be in production. ## RD-Connect pipeline Detailed information about the RD-Connect pipeline can be found in [Laurie et al., 2016](https://www.ncbi.nlm.nih.gov/pubmed/?term=27604516) ![alt text](https://raw.githubusercontent.com/inab/Wetlab2Variations/eosc-life/docs/RD-Connect_pipeline.jpg) ## The applications **1\\. Name of the application: Adaptor removal** Function: remove sequencing adaptors Container (readiness status, location, version): [cutadapt (v.1.18)](https://hub.docker.com/r/cnag/cutadapt) Required resources in cores and RAM: current container size 169MB Input data (amount, format, directory..): raw fastq Output data: paired fastq without adaptors **2\\. Name of the application: Mapping and bam sorting** Function: align data to reference genome Container : [bwa-mem (v.0.7.17)](https://hub.docker.com/r/cnag/bwa) / [Sambamba (v. 0.6.8 )](https://hub.docker.com/r/cnag/sambamba)(or samtools) Resources :current container size 111MB / 32MB Input data: paired fastq without adaptors Output data: sorted bam **3\\. 
Name of the application: MarkDuplicates** Function: Mark (and remove) duplicates Container: [Picard (v.2.18.25)](https://hub.docker.com/r/cnag/picard) Resources: current container size 261MB Input data:sorted bam Output data: Sorted bam with marked (or removed) duplicates **4\\. Name of the application: Base quality recalibration (BQSR)** Function: Base quality recalibration Container: [GATK (v.3.6-0)](https://hub.docker.com/r/cnag/gatk) Resources: current container size 270MB Input data: Sorted bam with marked (or removed) duplicates Output data: Sorted bam with marked duplicates & base quality recalculated **5\\. Name of the application: Variant calling** Function: variant calling Container: [GATK (v.3.6-0)](https://hub.docker.com/r/cnag/gatk) Resources: current container size 270MB Input data:Sorted bam with marked duplicates & base quality recalculated Output data: unannotated gvcf per sample **6\\. (OPTIONAL)Name of the application: Quality of the fastq** Function: report on the sequencing quality Container: [fastqc 0.11.8](https://hub.docker.com/r/cnag/fastqc) Resources: current container size 173MB Input data: raw fastq Output data: QC report ## Licensing GATK declares that archived packages are made available for free to academic researchers under a limited license for non-commercial use. If you need to use one of these packages for commercial use. https://software.broadinstitute.org/gatk/download/archive """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.106.3" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Nextflow, variant_calling" ; ns1:license ; ns1:name "VariantCaller_GATK3.6" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chromosome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq_files" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gqb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "known_indels_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "known_sites_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "readgroup_str" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gvcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metrics" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-02-18T15:01:31Z"^^ns1:Date ; ns1:dateModified "2023-04-21T12:35:06Z"^^ns1:Date ; ns1:description """Rare disease researchers workflow is that they submit their raw data (fastq), run the mapping and variant calling RD-Connect pipeline and obtain unannotated gvcf files to further submit to the RD-Connect GPAP or analyse on their own. This demonstrator focuses on the variant calling pipeline. The raw genomic data is processed using the RD-Connect pipeline ([Laurie et al., 2016](https://www.ncbi.nlm.nih.gov/pubmed/27604516)) running on the standards (GA4GH) compliant, interoperable container orchestration platform. This demonstrator will be aligned with the current implementation study on [Development of Architecture for Software Containers at ELIXIR and its use by EXCELERATE use-case communities](docs/Appendix%201%20-%20Project%20Plan%202018-biocontainers%2020171117.pdf) For this implementation, different steps are required: 1. Adapt the pipeline to CWL and dockerise elements 2. Align with IS efforts on software containers to package the different components (Nextflow) 3. Submit trio of Illumina NA12878 Platinum Genome or Exome to the GA4GH platform cloud (by Aspera or ftp server) 4. Run the RD-Connect pipeline on the container platform 5. Return corresponding gvcf files 6. 
OPTIONAL: annotate and update to RD-Connect playground instance N.B: The demonstrator might have some manual steps, which will not be in production. ## RD-Connect pipeline Detailed information about the RD-Connect pipeline can be found in [Laurie et al., 2016](https://www.ncbi.nlm.nih.gov/pubmed/?term=27604516) ![alt text](https://raw.githubusercontent.com/inab/Wetlab2Variations/eosc-life/docs/RD-Connect_pipeline.jpg) ## The applications **1\\. Name of the application: Adaptor removal** Function: remove sequencing adaptors Container (readiness status, location, version): [cutadapt (v.1.18)](https://hub.docker.com/r/cnag/cutadapt) Required resources in cores and RAM: current container size 169MB Input data (amount, format, directory..): raw fastq Output data: paired fastq without adaptors **2\\. Name of the application: Mapping and bam sorting** Function: align data to reference genome Container : [bwa-mem (v.0.7.17)](https://hub.docker.com/r/cnag/bwa) / [Sambamba (v. 0.6.8 )](https://hub.docker.com/r/cnag/sambamba)(or samtools) Resources :current container size 111MB / 32MB Input data: paired fastq without adaptors Output data: sorted bam **3\\. Name of the application: MarkDuplicates** Function: Mark (and remove) duplicates Container: [Picard (v.2.18.25)](https://hub.docker.com/r/cnag/picard) Resources: current container size 261MB Input data:sorted bam Output data: Sorted bam with marked (or removed) duplicates **4\\. Name of the application: Base quality recalibration (BQSR)** Function: Base quality recalibration Container: [GATK (v.3.6-0)](https://hub.docker.com/r/cnag/gatk) Resources: current container size 270MB Input data: Sorted bam with marked (or removed) duplicates Output data: Sorted bam with marked duplicates & base quality recalculated **5\\. 
Name of the application: Variant calling** Function: variant calling Container: [GATK (v.3.6-0)](https://hub.docker.com/r/cnag/gatk) Resources: current container size 270MB Input data:Sorted bam with marked duplicates & base quality recalculated Output data: unannotated gvcf per sample **6\\. (OPTIONAL)Name of the application: Quality of the fastq** Function: report on the sequencing quality Container: [fastqc 0.11.8](https://hub.docker.com/r/cnag/fastqc) Resources: current container size 173MB Input data: raw fastq Output data: QC report ## Licensing GATK declares that archived packages are made available for free to academic researchers under a limited license for non-commercial use. If you need to use one of these packages for commercial use. https://software.broadinstitute.org/gatk/download/archive """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.107.1" ; ns1:image ; ns1:input , , , , , , , ; ns1:keywords "CWL, variant_calling" ; ns1:license ; ns1:name "VariantCaller_GATK3.6" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/AF Filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/DP Filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/DP_ALT Filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Number of Clusters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Variation data to report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/gene products translations" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/af_filter_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/af_recalculated" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/all_variants_all_samples" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/by_variant_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/collapsed_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/combined_variant_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/dp_alt_filter_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/dp_filter_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_and_renamed_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_extracted_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/highest_impact_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/plot_number_of_clusters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/prefiltered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variant_frequency_plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_for_plotting" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-10-07T15:31:56Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:02:10Z"^^ns1:Date ; ns1:description "This workflow takes a VCF dataset of variants produced by any of the *-variant-calling workflows in https://github.com/galaxyproject/iwc/tree/main/workflows/sars-cov-2-variant-calling and generates tabular lists of variants by Samples and by Variant, and an overview plot of variants and their allele-frequencies." ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "sars-cov-2-variation-reporting/COVID-19-VARIATION-REPORTING" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 9 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T13:30:37Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:25Z"^^ns1:Date ; ns1:description "This workflow employs a recombination detection algorithm (GARD) developed by Kosakovsky Pond et al. and implemented in the hyphy package. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - Recombination and selection analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "" . a ns1:Person ; ns1:name "Niko Beerenwinkel" . a ns1:Person ; ns1:name "Susana Posada Céspedes" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ARTIC primer BED" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ARTIC primers to amplicon assignments" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Minimum DP required after amplicon bias correction" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Minimum DP_ALT required after amplicon bias correction" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NC_045512.2 FASTA sequence of SARS-CoV-2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Paired Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Read removal maximum AF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Read removal minimum AF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/amplicon_removal_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_softfiltered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_html_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_raw_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_raw_output_flattened" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_json_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_reads_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_preliminary_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/lost_filter_passing_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preliminary_variants_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preliminary_variants_1_filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preliminary_variants_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preliminary_variants_2_filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_plots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/primer_trimmed_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_primer_trimmed_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_primer_trimmed_filtered_mapped_reads_with_indel_quals" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_fixed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_fixed_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_fixed_partial" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-03-26T09:51:25Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:01:51Z"^^ns1:Date ; ns1:description """COVID-19: variation analysis on ARTIC PE data --------------------------------------------- The workflow for Illumina-sequenced ampliconic data builds on the RNASeq workflow for paired-end data using the same steps for mapping and variant calling, but adds extra logic for trimming amplicon primer sequences off reads with the ivar package. In addition, this workflow uses ivar also to identify amplicons affected by primer-binding site mutations and, if possible, excludes reads derived from such "tainted" amplicons when calculating allele-frequencies of other variants. """ ; ns1:input , , , , , , , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "covid-19, ARTIC, covid19.galaxyproject.org, BY-COVID" ; ns1:license ; ns1:name "sars-cov-2-pe-illumina-artic-variant-calling/COVID-19-PE-ARTIC-ILLUMINA" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 11 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Maximum read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Minimum read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NC_045512.2 FASTA sequence of SARS-CoV-2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ONT-sequenced reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Primer binding sites info in BED format" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_softfiltered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_html_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_raw_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_raw_output_flattened" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/length_filtered_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/lofreq_filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/medaka_consensus_data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/medaka_variants_general" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/medaka_variants_pbs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/minimap2_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/primer_trimmed_realigned_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_combined" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/variants_pbs" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-12-21T03:01:07Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:36Z"^^ns1:Date ; ns1:description """COVID-19: variation analysis on ARTIC ONT data ---------------------------------------------- This workflow for ONT-sequenced ARTIC data is modeled after the alignment/variant-calling steps of the [ARTIC pipeline](https://artic.readthedocs.io/en/latest/). It performs, essentially, the same steps as that pipeline’s minion command, i.e. read mapping with minimap2 and variant calling with medaka. Like the Illumina ARTIC workflow it uses ivar for primer trimming. Since ONT-sequenced reads have a much higher error rate than Illumina-sequenced reads and are therefor plagued more by false-positive variant calls, this workflow does make no attempt to handle amplicons affected by potential primer-binding site mutations. """ ; ns1:input , , , , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "covid-19, ONT, covid19.galaxyproject.org, ARTIC" ; ns1:license ; ns1:name "sars-cov-2-ont-artic-variant-calling/COVID-19-ARTIC-ONT" ; ns1:output , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NC_045512.2 FASTA sequence of SARS-CoV-2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Single End Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/called_variant" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessed_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_mapped_reads_with_indel_quals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/soft_filtered_variants" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-03-26T12:00:25Z"^^ns1:Date ; ns1:dateModified "2026-04-12T01:02:44Z"^^ns1:Date ; ns1:description "This workflows performs single end read mapping with bowtie2 followed by sensitive variant calling across a wide range of AFs with lofreq" ; ns1:input , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "sars-cov-2-se-illumina-wgs-variant-calling/COVID-19-SE-WGS-ILLUMINA" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NC_045512.2 FASTA sequence of SARS-CoV-2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Paired Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Map with BWA-MEM on input dataset(s) (mapped reads in BAM format)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/annotated_variants_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/called_variants" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_pe" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_filtered_mapped_reads_with_indel_quals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/soft_filtered_variants" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-12-21T03:01:05Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:33Z"^^ns1:Date ; ns1:description """COVID-19: variation analysis on WGS PE data ------------------------------------------- This workflows performs paired end read mapping with bwa-mem followed by sensitive variant calling across a wide range of AFs with lofreq and variant annotation with snpEff 4.5covid19. """ ; ns1:input , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "covid-19, covid19.galaxyproject.org, emergen_validated, Virology" ; ns1:license ; ns1:name "sars-cov-2-pe-illumina-wgs-variant-calling/COVID-19-PE-WGS-ILLUMINA" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-03-26T12:58:59Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:49:39Z"^^ns1:Date ; ns1:description """A porting of the Trinity RNA assembly pipeline, https://trinityrnaseq.github.io, that uses Nextflow to handle the underlying sub-tasks. This enables additional capabilities to better use HPC resources, such as packing of tasks to fill up nodes and use of node-local disks to improve I/O. By design, the pipeline separates the workflow logic (main file) and the cluster-specific configuration (config files), improving portability. Based on a pipeline by Sydney Informatics Hub: https://github.com/Sydney-Informatics-Hub/SIH-Raijin-Trinity""" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "Assembly, Transcriptomics, RNASEQ, Nextflow" ; ns1:license ; ns1:name "Trinity RNA Assembly" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-03-21T18:12:10Z"^^ns1:Date ; ns1:dateModified "2023-07-03T10:16:04Z"^^ns1:Date ; ns1:description """Workflow for tracking objects in Cell Profiler: https://training.galaxyproject.org/training-material/topics/imaging/tutorials/object-tracking-using-cell-profiler/tutorial.html""" ; ns1:image ; ns1:keywords "CellProfiler, imaging, Galaxy, image processing" ; ns1:license ; ns1:name "Object tracking using CellProfiler" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_2021021_observateur.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_20210221_observation.tsv" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_20210221_organisme.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_20210221_secteur_geog.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_20210221_sortie.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BDD_Kakila_v2_20210221_taxon.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kakila_database_of_marine_mammal_observation_data_in_the_AGOA_sanctuary_-_French_Antilles.xml" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-04-09T15:45:27Z"^^ns1:Date ; ns1:dateModified "2023-11-09T21:04:38Z"^^ns1:Date ; ns1:description "Workflow to take DataOne data packages (raw datasets + metadata written in Ecological Metadata Standard) as input and create a DwC occurence.csv file almost ready to put in a Dawrin core Archive using eml-annotations at the attribute level" ; ns1:input , , , , , , ; ns1:keywords "DataOne, Data package, EML, Ecological metadata language, eml-annotation, Darwin core, Galaxy-E, Galaxy" ; ns1:license ; ns1:name "Workflow constructed from history 'test dwc from PNDB Data package EML DwC annotations'" ; ns1:output , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "data_matrix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gmt_filepath" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "index_col" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "outdir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samples_on_rows" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "separator" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "resdir" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-05-05T15:50:31Z"^^ns1:Date ; ns1:dateModified "2023-04-21T11:04:20Z"^^ns1:Date ; ns1:description """# COnSensus Interaction Network InFErence Service Inference framework for reconstructing networks using a consensus approach between multiple methods and data sources. ![alt text](https://github.com/PhosphorylatedRabbits/cosifer/raw/master/docs/_static/logo.png) ## Reference [Manica, Matteo, Charlotte, Bunne, Roland, Mathis, Joris, Cadow, Mehmet Eren, Ahsen, Gustavo A, Stolovitzky, and María Rodríguez, Martínez. "COSIFER: a python package for the consensus inference of molecular interaction networks".Bioinformatics (2020)](https://doi.org/10.1093/bioinformatics/btaa942).""" ; ns1:image ; ns1:input , , , , , ; ns1:keywords "cosifer, cancer, pediatric, rna-seq" ; ns1:license ; ns1:name "COSIFER" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-05-05T15:53:28Z"^^ns1:Date ; ns1:dateModified "2023-04-21T11:04:40Z"^^ns1:Date ; ns1:description """# COnSensus Interaction Network InFErence Service Inference framework for reconstructing networks using a consensus approach between multiple methods and data sources. ![alt text](https://raw.githubusercontent.com/PhosphorylatedRabbits/cosifer/master/docs/_static/logo.png) ## Reference [Manica, Matteo, Charlotte, Bunne, Roland, Mathis, Joris, Cadow, Mehmet Eren, Ahsen, Gustavo A, Stolovitzky, and María Rodríguez, Martínez. 
"COSIFER: a python package for the consensus inference of molecular interaction networks".Bioinformatics (2020)](https://doi.org/10.1093/bioinformatics/btaa942).""" ; ns1:keywords "cosifer, cancer, pediatric, rna-seq" ; ns1:license ; ns1:name "COSIFER" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2020-04-10T13:46:55Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:31Z"^^ns1:Date ; ns1:description " A version of V-pipe (analysis of next generation sequencing (NGS) data from viral pathogens) specifically adapted to analyze high-throughput sequencing data of SARS-CoV-2. " ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "(old) SARS-COV2 version of the V-Pipe workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T09:14:04Z"^^ns1:Date ; ns1:dateModified "2026-03-24T09:19:09Z"^^ns1:Date ; ns1:description """# Protein MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.120.8" ; ns1:isBasedOn ; ns1:isPartOf , , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 8 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS grompp configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS grompp configuration dictionary" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS mdrun configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Editconf configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS grompp configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genion configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS grompp configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS make_ndx configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS grompp configuration dictionary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "whole workflow output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-05-20T14:41:19Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:49:52Z"^^ns1:Date ; ns1:description """CWL version of the md_list.py workflow for HPC. This performs a system setup and runs a molecular dynamics simulation on the structure passed to this workflow. This workflow uses the md\\_gather.cwl sub-workflow to gather the outputs together to return these. To work with more than one structure this workflow can be called from either the md\\_launch.cwl workflow, or the md\\_launch\\_mutate.cwl workflow. These use scatter for parallelising the workflow. md\\_launch.cwl operates on a list of individual input molecule files. md\\_launch\\_mutate.cwl operates on a single input molecule file, and a list of mutations to apply to that molecule. Within that list of mutations, a value of 'WT' will indicate that the molecule should be simulated without any mutation being applied. 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.121.1" ; ns1:image ; ns1:input , , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Molecular Dynamics Simulation" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "endpoint" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "query_file" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-05-26T10:51:51Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:49:55Z"^^ns1:Date ; ns1:description "A workflow querying on an endpoint of a graph database by a file containing a SPARQL query." ; ns1:image ; ns1:input , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "SPARQL query (in a file) on graph database" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Temperature timeseries (csv)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "daily_barchart" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "daily_mean_timeseries" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "monthly_mean_timeseries" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stripes_daily_temperatures" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stripes_monthly_temperatures" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-05-23T19:28:06Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:50:01Z"^^ns1:Date ; ns1:description """This workflow is used to process timeseries from meteorological stations in Finland but can be applied to any timeseries according it follows the same format. 
Take a temperature timeseries from any meteorological station. Input format is csv and it must be standardized with 6 columns: 1. Year (ex: 2021) 2. month (ex: 1) 3. day (ex: 15) 4. Time (ex: 16:56) 5. Time zone (such as UTC) 6. Air temperature (degC)""" ; ns1:input ; ns1:keywords "Climate, eosc-nordic, observation" ; ns1:license ; ns1:name "Compute daily and monthly mean from meteorological station measurements" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_taxonomy_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken_taxonomy_table_modified" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Taxonomic_prediction_report" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-06-17T09:14:19Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "Metagenomic dataset taxonomic classification using kraken2" ; ns1:keywords "Virology, kraken" ; ns1:license ; ns1:name "1: Plant virus detection with kraken2 (SE)" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-06-21T08:33:47Z"^^ns1:Date ; ns1:dateModified "2024-06-12T09:57:49Z"^^ns1:Date ; ns1:description "Variant Interpretation Pipeline (VIP) that annotates, filters and reports prioritized causal variants in humans, see https://github.com/molgenis/vip for more information." 
; ns1:keywords "Annotation, Report, VCF, Classification, SV, Pipeline, Bioinformatics, Genomics, Workflows, Java, SNPs, variation, Nextflow" ; ns1:license ; ns1:name "MOLGENIS/VIP: Variant Interpretation Pipeline" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Daniël Wijnbergen" . a ns1:Person ; ns1:name "Eleni Mina" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , ; ns1:dateCreated "2021-06-23T10:42:46Z"^^ns1:Date ; ns1:dateModified "2022-10-27T16:39:25Z"^^ns1:Date ; ns1:description """ Joint multi-omics dimensionality reduction approaches for CAKUT data using peptidome and proteome data **Brief description** In (Cantini et al. 2020), Cantini et al. evaluated 9 representative joint dimensionality reduction (jDR) methods for multi-omics integration and analysis and . The methods are Regularized Generalized Canonical Correlation Analysis (RGCCA), Multiple co-inertia analysis (MCIA), Multi-Omics Factor Analysis (MOFA), Multi-Study Factor Analysis (MSFA), iCluster, Integrative NMF (intNMF), Joint and Individual Variation Explained (JIVE), tensorial Independent Component Analysis (tICA), and matrix-tri-factorization (scikit-fusion) (Tenenhaus, Tenenhaus, and Groenen 2017; Bady et al. 2004; Argelaguet et al. 2018; De Vito et al. 2019; Shen, Olshen, and Ladanyi 2009; Chalise and Fridley 2017; Lock et al. 2013; Teschendorff et al. 2018; Žitnik and Zupan 2015). The authors provided their benchmarking procedure, multi-omics mix (momix), as Jupyter Notebook on GitHub (https://github.com/ComputationalSystemsBiology/momix-notebook) and project environment through Conda. In momix, the factorization methods are called from an R script, and parameters of the methods are also set in that script. We did not modify the parameters of the methods in the provided script. We set factor number to 2. 
""" ; ns1:keywords "rare diseases, workflow, Proteomics, protein, mirna prediction" ; ns1:license ; ns1:name "EJP-RD WP13 case-study CAKUT momix analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-03-24T15:15:39Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:18:08Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.127.8" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein-ligand Docking tutorial (Cluster90)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 8 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-03-24T15:18:40Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:20:24Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.128.7" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein-ligand Docking tutorial (PDBe REST API)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-03-24T15:22:14Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:24:30Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. 
The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.129.7" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein-ligand Docking tutorial (Fpocket)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:17:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:34Z"^^ns1:Date ; ns1:description "This workflow is used for the preparation of protein and ligands for docking. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - Enumerate ligands for docking" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mpro-x0195_0_apo-desolv_pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hits_frankenstein_17_sdf" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T10:00:32Z"^^ns1:Date ; ns1:dateModified "2026-03-26T10:02:19Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.130.7" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Amber Protein MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T10:33:46Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:11:40Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.131.7" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Amber Protein Ligand Complex MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:41:41Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:43:35Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.132.7" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Amber Constant pH MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2021-07-08T14:18:03Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:50:20Z"^^ns1:Date ; ns1:description """ORSON combine state-of-the-art tools for annotation processes within a Nextflow pipeline: sequence similarity search (PLAST, BLAST or Diamond), functional annotation retrieval (BeeDeeM) and functional prediction (InterProScan). When required, BUSCO completness evaluation and eggNOG Orthogroup annotation can be activated. While ORSON results can be analyzed through the command-line, it also offers the possibility to be compatible with BlastViewer or Blast2GO graphical tools. 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.136.1" ; ns1:image ; ns1:keywords "Annotation, Transcriptomics, Genomics, Proteomics, Nextflow" ; ns1:license ; ns1:name "ORSON: workflow for prOteome and tRanScriptome functiOnal aNnotation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "IWC" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Run accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Paired End Reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Single End Reads" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-06-11T02:01:57Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:01:51Z"^^ns1:Date ; ns1:description "Downloads fastq files for sequencing run accessions provided in a text file using fasterq-dump. Creates one job per listed run accession." ; ns1:input ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "parallel-accession-download/main" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 9 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Depth-threshold for masking" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Variant calls" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/aligned reads data for depth calculation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/min-AF for consensus variant" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/min-AF for failed variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/1_based_masking_regions" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/called_variant_sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/chrom_pos_ref_called_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/chrom_pos_ref_failed_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/consensus_af_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/consensus_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/coverage_depth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/depth_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/failed_variant_sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filter_failed_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/low_cov_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/low_cov_regions_plus_filter_failed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/low_cov_regions_plus_filter_failed_combined" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/masking_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/multisample_consensus_fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/non_consensus_af_threshold" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-03-26T12:00:41Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:01:56Z"^^ns1:Date ; ns1:description """Build a consensus sequence from FILTER PASS variants with intrasample allele-frequency above a configurable consensus threshold. 
Hard-mask regions with low coverage (but not consensus variants within them) and ambiguous sites.""" ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "sars-cov-2-consensus-from-variation/COVID-19-CONSENSUS-CONSTRUCTION" ; ns1:output , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 9 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:25:46Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:43Z"^^ns1:Date ; ns1:description "This workflow generates a file describing the active site of the protein for each of the fragment screening crystal structures using rDock's rbcavity. It also creates a single hybrid molecule that contains all the ligands - the \"frankenstein\" ligand. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - Active site generation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2_Input Dataset Collection" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name " Brian M Forde" . a ns1:Person ; ns1:name "Adam D Irwin" . a ns1:Person ; ns1:name "David L Paterson" . a ns1:Person ; ns1:name "David M Whiley" . 
a ns1:Person ; ns1:name "Leah W Roberts" . a ns1:Person ; ns1:name "Mark A Schembri" . a ns1:Person ; ns1:name "Minh-Duy Phan" . a ns1:Person ; ns1:name "Nguyen Thi Khanh Nhu" . a ns1:Person ; ns1:name "Patrick N A Harris" . a ns1:Person ; ns1:name "Scott A Beatson " . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , ; ns1:dateCreated "2021-08-09T00:17:36Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:29Z"^^ns1:Date ; ns1:description """microPIPE was developed to automate high-quality complete bacterial genome assembly using Oxford Nanopore Sequencing in combination with Illumina sequencing. To build microPIPE we evaluated the performance of several tools at each step of bacterial genome assembly, including basecalling, assembly, and polishing. Results at each step were validated using the high-quality ST131 Escherichia coli strain EC958 (GenBank: HG941718.1). After appraisal of each step, we selected the best combination of tools to achieve the most consistent and best quality bacterial genome assemblies. The workflow below summarises the different steps of the pipeline (with each selected tool) and the approximate run time (using GPU basecalling, averaged over 12 E. coli isolates sequenced on a R9.4 MinION flow cell). Dashed boxes correspond to optional steps in the pipeline. Micropipe has been written in Nextflow and uses Singularity containers. It can use both GPU and CPU resources. 
For more information please see our publication here: https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-021-07767-z Infrastructure\\_deployment\\_metadata: Zeus (Pawsey)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.140.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "ONT, bacterial-genomics, Assembly, Nextflow, workflow" ; ns1:license ; ns1:name "microPIPE: a pipeline for high-quality bacterial genome construction using ONT and Illumina sequencing" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-08-09T20:25:49Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:30Z"^^ns1:Date ; ns1:description """This repository contains the workflow used to find and characterize the HI sources in the data cube of the SKA Data Challenge 2. It was developed to process a simulated [SKA data cube](https://sdc2.astronomers.skatelescope.org/sdc2-challenge/data) data cube, but can be adapted for clean HI data cubes from other radio observatories. The workflow is managed and executed using snakemake workflow management system. It uses [https://spectral-cube.readthedocs.io/en/latest/](http://) based on [https://dask.org/](http://) parallelization tool and [https://www.astropy.org/](http://) suite to divide the large cube in smaller pieces. On each of the subcubes, we execute [https://github.com/SoFiA-Admin/SoFiA-2](http://) for masking the subcubes, find sources and characterize their properties. Finally, the individual catalogs are cleaned, concatenated into a single catalog, and duplicates from the overlapping regions are eliminated. Some diagnostic plots are produced using Jupyter notebook. The documentation can be found in the [Documentation page](https://hi-friends-sdc2.readthedocs.io/en/latest/index.html). 
The workflow and the results can be cited in the [Zenodo record](https://doi.org/10.5281/zenodo.5167659).""" ; ns1:image ; ns1:keywords "SKA, radio interferometry" ; ns1:license ; ns1:name "HI-FRIENDS HI data cube source finding and characterization" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-08-11T04:12:04Z"^^ns1:Date ; ns1:dateModified "2024-04-17T04:16:05Z"^^ns1:Date ; ns1:description "This is a Galaxy workflow that is used to convert the 16S BIOM file to table and figures. It is part of the metaDEGalaxy workflow MetaDEGalaxy: Galaxy workflow for differential abundance analysis of 16s metagenomic data. " ; ns1:image ; ns1:isPartOf ; ns1:keywords "MetaDEGalaxy" ; ns1:license ; ns1:name "16S_biodiversity_BIOM" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2021-08-17T04:35:21Z"^^ns1:Date ; ns1:dateModified "2025-07-25T02:04:55Z"^^ns1:Date ; ns1:description """Germline-ShortV @ NCI-Gadi is an implementation of the BROAD Institute's best practice workflow for germline short variant discovery. 
This implementation is optimised for the National Compute Infrastructure's Gadi HPC, utilising scatter-gather parallelism to enable use of multiple nodes with high CPU or memory efficiency. This workflow requires sample BAM files, which can be generated using the [Fastq-to-bam @ NCI-Gadi](https://workflowhub.eu/workflows/146) pipeline. Germline-ShortV can be applied to model and non-model organisms (including non-diploid organisms). Infrastructure\\_deployment\\_metadata: Gadi (NCI)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.143.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "GATK4, variant_calling, WGS, SNPs, INDELs, HaplotypeCaller, Germline, BROAD, Genomics, genome, DNA, DNA-seq" ; ns1:license ; ns1:name "Germline-ShortV @ NCI-Gadi" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-08-17T04:42:40Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:44Z"^^ns1:Date ; ns1:description """# SLURM HPC Cromwell implementation of GATK4 germline variant calling pipeline See the [GATK](https://gatk.broadinstitute.org/hc/en-us) website for more information on this toolset ## Assumptions - Using hg38 human reference genome build - Running using HPC/SLURM scheduling. This repo was specifically tested on Pawsey Zeus machine, primarily running in the `/scratch` partition. - Starting from short-read Illumina paired-end fastq files as input ### Dependencies The following versions have been tested and work, but GATK and Cromwell are regularly updated and so one must consider whether they would like to use newer versions of these tools. - BWA/0.7.15 - GATK v4.0.6.0 - SAMtools/1.5 - picard/2.9 - Python/2.7 - Cromwell v61 ## Quick start guide ### Installing and preparing environment for GATK4 with Cromwell 1. 
Clone repository ``` git clone https://github.com/SarahBeecroft/slurmCromwellGATK4.git cd slurmCromwellGATK4 chmod +x *.sh ``` 2. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) if you haven’t already. This is best placed in your `/group` directory to avoid filling your small `/home` directory, or being purged is placed in the `/scratch` directory. 3. Create Conda environment using the supplied conda environment file ``` conda env create --file gatk4_pipeline.yml ``` 3. Download the necessary .jar files - The Cromwell workfow orchestration engine can be downloaded from https://github.com/broadinstitute/cromwell/releases/ - GATK can be downloaded from https://github.com/broadinstitute/gatk/releases. Unzip the file with `unzip` - Picard can be downloaded from https://github.com/broadinstitute/picard/releases/ 4. If you do not have the resource bundle files already, these need to be downloaded. In future they will be cached on Pawsey systems. The bundle data should be download from the [Google Cloud bucket](https://console.cloud.google.com/storage/browser/genomics-public-data/references/hg38/v0;tab=objects?_ga=2.98248159.1769807612.1582055494-233304531.1578854612&pli=1&prefix=&forceOnObjectsSortingFiltering=false) and not from the FTP site, which is missing various files. Refer to this handy [blog post](https://davetang.org/muse/2020/02/21/using-google-cloud-sdk-to-download-gatk-resource-bundle-files/) on how to download the resource files using Google Cloud SDK. There is a Slurm script (download_bundle.slurm) that can be used to download all hg38 files from the Google Cloud bucket. The files were downloaded in /scratch/pawsey0001/sbeecroft/hg38/v0, which needs to be moved before the data becomes purged after 30 days. Note that Homo_sapiens_assembly38.dbsnp138.vcf.gz was from the FTP bundle as this file could not be downloaded using the Conda version of Google Cloud SDK. 
Note that the `hg38_wgs_scattered_calling_intervals.txt` will need to be to generated using the following: ``` cd find `pwd` -name "scattered.interval_list" -print | sort > hg38_wgs_scattered_calling_intervals.txt ``` These files are required for Multisample_Fastq_to_Gvcf_GATK4. ``` Homo_sapiens_assembly38.dict Homo_sapiens_assembly38.fasta Homo_sapiens_assembly38.fasta.fai Homo_sapiens_assembly38.fasta.64.alt Homo_sapiens_assembly38.fasta.64.amb Homo_sapiens_assembly38.fasta.64.ann Homo_sapiens_assembly38.fasta.64.bwt Homo_sapiens_assembly38.fasta.64.pac Homo_sapiens_assembly38.fasta.64.sa Homo_sapiens_assembly38.fasta.amb Homo_sapiens_assembly38.fasta.ann Homo_sapiens_assembly38.fasta.bwt Homo_sapiens_assembly38.fasta.pac Homo_sapiens_assembly38.fasta.sa Homo_sapiens_assembly38.dbsnp138.vcf.gz (needs to be gunzipped) Homo_sapiens_assembly38.dbsnp138.vcf.idx Mills_and_1000G_gold_standard.indels.hg38.vcf.gz Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi Homo_sapiens_assembly38.dbsnp138.vcf Homo_sapiens_assembly38.dbsnp138.vcf.idx Homo_sapiens_assembly38.known_indels.vcf.gz Homo_sapiens_assembly38.known_indels.vcf.gz.tbi ``` These files are required for Multisample_jointgt_GATK4. ``` wgs_evaluation_regions.hg38.interval_list hg38.custom_100Mb.intervals Homo_sapiens_assembly38.dbsnp138.vcf Homo_sapiens_assembly38.dbsnp138.vcf.idx 1000G_phase1.snps.high_confidence.hg38.vcf.gz 1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi 1000G_omni2.5.hg38.vcf.gz 1000G_omni2.5.hg38.vcf.gz.tbi Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi hapmap_3.3.hg38.vcf.gz hapmap_3.3.hg38.vcf.gz.tbi ``` 5. Set up the config files. 
Files that you need to edit with the correct paths to your data/jar files or other specific configurations are: - `Multisample_Fastq_to_Gvcf_GATK4_inputs_hg38.json` - `Multisample_jointgt_GATK4_inputs_hg38.json` - both json files will need the correct paths to your reference file locations, and the file specifying your inputs i.e. `samples.txt` or `gvcfs.txt` - `samples.txt` - `gvcfs.txt` - These are the sample input files (tab seperated) - The format for samples.txt is sampleID, sampleID_readgroup, path_to_fastq_R1_file, path_to_fastq_R2_file, - The format for gvcfs.txt is sample ID, gvcf, gvcf .tbi index file - Examples are included in this repo - NOTE: Having tabs, not spaces, is vital for parsing the file. Visual studio code tends to introduce spaces, so if you are having issues, check the file with another text editor such as sublime. - `launch_cromwell.sh` - `launch_jointgt.sh` - These are the scripts which launch the pipeline. - `launch_cromwell.sh` launches the fastq to gvcf stage - `launch_jointgt.sh` launched the gvcf joint genotyping to cohort vcf step. This is perfomed when you have run all samples through the fastq to gvcf stage. - Check the paths and parameters make sense for your machine - `slurm.conf` - the main options here relate to the job scheduler. If you are running on Zeus at Pawsey, you should not need to alter these parameters. - `cromwell.options` - `cromwell.options` requires editing to provide the directory where you would like the final workflow outputs to be written - `Multisample_Fastq_to_Gvcf_GATK4.wdl` - `ruddle_fastq_to_gvcf_single_sample_gatk4.wdl` - The paths to your jar files will need to be updated - The path to your conda `activate` binary will need to be updated (e.g. `/group/projectID/userID/miniconda/bin/activate`) 6. Launch the job using `sbatch launch_cromwell.sh`. When that has completed successfully, you can launch the second stage of the pipeline (joint calling) with `sbatch launch_jointgt.sh`. 
### Overview of the steps in `Multisample_Fastq_to_Gvcf_GATK4.wdl` This part of the pipeline takes short-read, Illumina paired-end fastq files as the input. The outputs generated are sorted, duplicate marked bam files and their indices, duplicate metric information, and a GVCF file for each sample. The GVCF files are used as input for the second part of the pipeline (joint genotyping). ``` FastqToUbam GetBwaVersion SamToFastqAndBwaMem MergeBamAlignment SortAndFixTags MarkDuplicates CreateSequenceGroupingTSV BaseRecalibrator GatherBqsrReports ApplyBQSR GatherBamFiles HaplotypeCaller MergeGVCFs ``` ### Overview of the steps in `Multisample_jointgt_GATK4.wdl` This part of the pipeline takes GVCF files (one per sample), and performs joint genotyping across all of the provided samples. This means that old previously generated GVCFs can be joint-called with new GVCFs whenever you need to add new samples. The key output from this is a joint-genotyped, cohort-wide VCF file. ``` GetNumberOfSamples ImportGVCFs GenotypeGVCFs HardFilterAndMakeSitesOnlyVcf IndelsVariantRecalibrator SNPsVariantRecalibratorCreateModel SNPsVariantRecalibrator GatherTranches ApplyRecalibration GatherVcfs CollectVariantCallingMetrics GatherMetrics DynamicallyCombineIntervals ``` """ ; ns1:isPartOf ; ns1:keywords "GATK4, Genomics, Alignment, variant_calling, SNPs, INDELs" ; ns1:license ; ns1:name "GATK4 Fastq to joint-called cohort VCF with Cromwell on SLURM" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrey Bliznyuk" . a ns1:Person ; ns1:name "Ben Evans" . a ns1:Person ; ns1:name "Ben Menadue" . a ns1:Person ; ns1:name "Matthew Downton" . a ns1:Person ; ns1:name "Rika Kobayashi" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2021-08-17T04:44:57Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:44Z"^^ns1:Date ; ns1:description """Description: Trinity @ NCI-Gadi contains a staged [Trinity](https://github.com/trinityrnaseq/trinityrnaseq/wiki) workflow that can be run on the National Computational Infrastructure’s (NCI) Gadi supercomputer. Trinity performs de novo transcriptome assembly of RNA-seq data by combining three independent software modules Inchworm, Chrysalis and Butterfly to process RNA-seq reads. The algorithm can detect isoforms, handle paired-end reads, multiple insert sizes and strandedness. Infrastructure\\_deployment\\_metadata: Gadi (NCI)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.145.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "Assembly, Transcriptomics, trinity, NCI, RNASEQ, rna, rna-seq, Gadi, scalable, PBS" ; ns1:license ; ns1:name "Trinity @ NCI-Gadi" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrey Bliznyuk" . a ns1:Person ; ns1:name "Ben Menadue" . a ns1:Person ; ns1:name "Matthew Downton" . a ns1:Person ; ns1:name "Rika Kobayashi" . a ns1:Person ; ns1:name "Yue Sun" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , ; ns1:dateCreated "2021-08-17T04:45:57Z"^^ns1:Date ; ns1:dateModified "2025-07-25T01:52:31Z"^^ns1:Date ; ns1:description """Fastq-to-BAM @ NCI-Gadi is a genome alignment workflow that takes raw FASTQ files, aligns them to a reference genome and outputs analysis ready BAM files. 
This workflow is designed for the National Computational Infrastructure's (NCI) Gadi supercomputer, leveraging multiple nodes on NCI Gadi to run all stages of the workflow in parallel, either massively parallel using the scatter-gather approach or parallel by sample.
Clone repository ``` git clone https://github.com/SarahBeecroft/cromwellGATK4.git cd cromwellGATK4 chmod 777 *.sh ``` 2. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) if you haven’t already. Create Conda environment using the supplied conda environment file ``` conda env create --file gatk4_pipeline.yml ``` 3. Download the necessary .jar files - The Cromwell workfow orchestration engine can be downloaded from https://github.com/broadinstitute/cromwell/releases/ - GATK can be downloaded from https://github.com/broadinstitute/gatk/releases. Unzip the file with `unzip` - Picard can be downloaded from https://github.com/broadinstitute/picard/releases/ 4. Upload the resource bundle file from IRDS using rclone or filezilla and unpack it with `tar xzvf resource.tar.gz`. Note that the `hg38_wgs_scattered_calling_intervals.txt` will need to be to generated using the following: ``` cd find `pwd` -name "scattered.interval_list" -print | sort > hg38_wgs_scattered_calling_intervals.txt ``` 5. Set up the config files. Files that you need to edit with the correct paths to your data/jar files or other specific configurations are: - `Multisample_Fastq_to_Gvcf_GATK4_inputs_hg38.json` - `Multisample_jointgt_GATK4_inputs_hg38.json` - both json files will need the correct paths to your reference file locations, and the file specifying your inputs i.e. `samples.txt` or `gvcfs.txt` - `samples.txt` - `gvcfs.txt` - These are the sample input files (tab seperated) - The format for samples.txt is sampleID, sampleID_readgroup, path_to_fastq_R1_file, path_to_fastq_R2_file, - The format for gvcfs.txt is sample ID, gvcf, gvcf .tbi index file - Examples are included in this repo - NOTE: Having tabs, not spaces, is vital for parsing the file. Visual studio code tends to introduce spaces, so if you are having issues, check the file with another text editor such as sublime. - `launch_cromwell.sh` - `launch_jointgt.sh` - These are the scripts which launch the pipeline. 
- `launch_cromwell.sh` launches the fastq to gvcf stage - `launch_jointgt.sh` launched the gvcf joint genotyping to cohort vcf step. This is perfomed when you have run all samples through the fastq to gvcf stage. - Check the paths and parameters make sense for your machine - `local.conf` - the main tuneable parameters here are: - `concurrent-job-limit = 5` this is the max number of concurrent jobs that can be spawned by cromwell. This depends on the computational resources available to you. 5 was determined to work reasonably well on a 16 CPU, 64GB RAM Nimbus VM (Pawsey). - `call-caching enabled = true`. Setting this parameter to `false` will disable call caching (i.e. being able to resume if the job fails before completion). By default, call caching is enabled. - `cromwell.options` - `cromwell.options` requires editing to provide the directory where you would like the final workflow outputs to be written - `Multisample_Fastq_to_Gvcf_GATK4.wdl` - `ruddle_fastq_to_gvcf_single_sample_gatk4.wdl` - The paths to your jar files will need to be updated - The path to your conda `activate` binary will need to be updated (e.g. `/data/miniconda/bin/activate`) 6. Launch the job within a `screen` or `tmux` session, using `./launch_cromwell.sh`. When that has completed successfully, you can launch the second stage of the pipeline (joint calling) with `./launch_jointgt.sh`. Ensure you pipe the stdout and stderr to a log file using (for example) `./launch_cromwell.sh &> cromwell.log` ### Overview of the steps in `Multisample_Fastq_to_Gvcf_GATK4.wdl` This part of the pipeline takes short-read, Illumina paired-end fastq files as the input. The outputs generated are sorted, duplicate marked bam files and their indices, duplicate metric information, and a GVCF file for each sample. The GVCF files are used as input for the second part of the pipeline (joint genotyping). 
``` FastqToUbam GetBwaVersion SamToFastqAndBwaMem MergeBamAlignment SortAndFixTags MarkDuplicates CreateSequenceGroupingTSV BaseRecalibrator GatherBqsrReports ApplyBQSR GatherBamFiles HaplotypeCaller MergeGVCFs ``` ### Overview of the steps in `Multisample_jointgt_GATK4.wdl` This part of the pipeline takes GVCF files (one per sample), and performs joint genotyping across all of the provided samples. This means that old previously generated GVCFs can be joint-called with new GVCFs whenever you need to add new samples. The key output from this is a joint-genotyped, cohort-wide VCF file. This file can be used for a GEMINI database after normalisation with VT and annotation with a tool such as VEP or SNPEFF. The file `hg38.custom_100Mb.intervals` is required for this step of the pipeline to run. This is included in the git repo for convenience, but should be moved to your resource directory with all the other resource files. ``` GetNumberOfSamples ImportGVCFs GenotypeGVCFs HardFilterAndMakeSitesOnlyVcf IndelsVariantRecalibrator SNPsVariantRecalibratorCreateModel SNPsVariantRecalibrator GatherTranches ApplyRecalibration GatherVcfs CollectVariantCallingMetrics GatherMetrics DynamicallyCombineIntervals ``` """ ; ns1:isPartOf ; ns1:keywords "Alignment, GATK4, Genomics, variant_calling, SNPs, INDELs, workflow" ; ns1:license ; ns1:name "GATK4 Fastq to joint-called cohort VCF with Cromwell on local cluster (no job scheduler)" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Shell Script" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2021-08-18T23:14:55Z"^^ns1:Date ; ns1:dateModified "2025-07-25T02:06:02Z"^^ns1:Date ; ns1:description """Somatic-ShortV @ NCI-Gadi is a variant calling pipeline that calls somatic short variants (SNPs and indels) from tumour and matched normal BAM files following [GATK's Best Practice Workflow](https://gatk.broadinstitute.org/hc/en-us/articles/360035894731-Somatic-short-variant-discovery-SNVs-Indels-). This workflow is designed for the National Computational Infrastructure's (NCI) Gadi supercompter, leveraging multiple nodes on NCI Gadi to run all stages of the workflow in parallel. Infrastructure\\_deployment\\_metadata: Gadi (NCI)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.148.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "GATK4, SNPs, INDELs, Somatic, variant_calling, Mutect2, NCI, NCI Gadi, Gadi, cancer, tumour, NCI-Gadi, scalable, VCF" ; ns1:license ; ns1:name "Somatic-ShortV @ NCI-Gadi" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Cali Willet" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2021-08-18T23:17:42Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:45Z"^^ns1:Date ; ns1:description """Flashlite-Trinity contains two workflows that run Trinity on the [University of Queensland's HPC, Flashlite](https://rcc.uq.edu.au/flashlite). Trinity performs de novo transcriptome assembly of RNA-seq data by combining three independent software modules Inchworm, Chrysalis and Butterfly to process RNA-seq reads. The algorithm can detect isoforms, handle paired-end reads, multiple insert sizes and strandedness. 
Users can run Flashlite-Trinity on single samples, or smaller samples requiring <500Gb of memory or staged Trinity which is recommended for global assemblies with multiple sample inputs. Both implementations make use of Singularity containers to install software. Infrastructure\\_deployment\\_metadata: FlashLite (QRISCloud)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.149.1" ; ns1:isPartOf ; ns1:keywords "trinity, Transcriptomics, Assembly, illumina, salmon, scalable, global assemblies, rna-seq, de novo, transcriptome, strandedness, rna, singularity, container, PBS" ; ns1:license ; ns1:name "Flashlite-Trinity" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:32:01Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:50Z"^^ns1:Date ; ns1:description "Docking performed by rDock using as 3 different kind of inputs. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - Docking" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name "Nathaniel Butterworth" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2021-08-18T23:19:40Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:45Z"^^ns1:Date ; ns1:description """Flashlite-Juicer is a PBS implementation of [Juicer](https://github.com/aidenlab/juicer) for University of Queensland's Flashlite HPC. Infrastructure\\_deployment\\_metadata: FlashLite (QRISCloud)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.150.1" ; ns1:isPartOf ; ns1:keywords "Juicer, Flashlite, Hi-C, PBS, TAD, scalable, map, FASTQ, BWA, topologically associating domains" ; ns1:license ; ns1:name "Flashlite-Juicer" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-08-18T23:21:08Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:45Z"^^ns1:Date ; ns1:description """The Flashlite-Supernova pipeline runs Supernova to generate phased whole-genome de novo assemblies from a Chromium prepared library on [University of Queensland's HPC, Flashlite](https://rcc.uq.edu.au/flashlite). Infrastructure\\_deployment\\_metadata: FlashLite (QRISCloud)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.151.1" ; ns1:isPartOf ; ns1:keywords "Flashlite, Supernova, 10X, TELLSeq" ; ns1:license ; ns1:name "Flashlite-Supernova" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Cali Willet" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2021-08-18T23:24:08Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:51:46Z"^^ns1:Date ; ns1:description """RNASeq-DE @ NCI-Gadi processes RNA sequencing data (single, paired and/or multiplexed) for differential expression (raw FASTQ to counts). 
This pipeline consists of multiple stages and is designed for the National Computational Infrastructure's (NCI) Gadi supercomputer, leveraging multiple nodes to run each stage in parallel.
a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward primer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Subfragment name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Metadata file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Primers are removed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse primer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sample name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_fastqc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_ngtax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_phyloseq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_picrust2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "turtle" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-09-30T11:15:19Z"^^ns1:Date ; ns1:dateModified "2025-09-11T07:04:34Z"^^ns1:Date ; ns1:description """Workflow for quality assessment of paired reads and classification using NGTax 2.0 and functional annotation using picrust2.
In addition files are exported to their respective subfolders for easier data management in a later stage.

Steps: - Quality plots (FastQC) - NG-TAX 2 High-throughput Amplicon Analysis - PICRUSt 2 - Function prediction from marker gene sequences - Export module for ngtax""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.154.2" ; ns1:image ; ns1:input , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Amplicon, Classification, CWL" ; ns1:license ; ns1:name "Quality assessment, amplicon classification and functional prediction" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum quality score to call base" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired read collection for samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Primer BED" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read fraction to call variant" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference FASTA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Version of pangolin-data to use" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "all_samples_nextclade" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "all_samples_pangolin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamqc_report_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_multifasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ivar_consensus_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ivar_variants_tabular" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "primer_trimmed_bam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snpeff_annotated_vcf" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-03-10T03:01:54Z"^^ns1:Date ; ns1:dateModified "2026-03-10T03:01:54Z"^^ns1:Date ; ns1:description "Find and annotate variants in ampliconic SARS-CoV-2 Illumina sequencing data and classify samples with pangolin and nextclade" ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:keywords "Virology" ; ns1:license ; ns1:name "sars-cov-2-pe-illumina-artic-ivar-analysis/SARS-COV-2-ILLUMINA-AMPLICON-IVAR-PANGOLIN-NEXTCLADE" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:Person ; ns1:name "Laure Quintric" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2021-09-10T13:40:28Z"^^ns1:Date ; ns1:dateModified "2025-06-05T14:46:14Z"^^ns1:Date ; ns1:description """SAMBA is a FAIR scalable workflow integrating, into a unique tool, state-of-the-art bioinformatics and statistical methods to conduct reproducible eDNA analyses using Nextflow. SAMBA starts processing by verifying integrity of raw reads and metadata. Then all bioinformatics processing is done using commonly used procedure (QIIME 2 and DADA2) but adds new steps relying on dbOTU3 and microDecon to build high quality ASV count tables. Extended statistical analyses are also performed. Finally, SAMBA produces a full dynamic HTML report including resources used, commands executed, intermediate results, statistical analyses and figures. The SAMBA pipeline can run tasks across multiple compute infrastructures in a very portable manner. 
It comes with singularity containers making installation trivial and results highly reproducible.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.156.1" ; ns1:image ; ns1:keywords "Metabarcoding, Nextflow, 16S, 18S, eDNA" ; ns1:license ; ns1:name "SAMBA: Standardized and Automated MetaBarcoding Analyses workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:50:31Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:57Z"^^ns1:Date ; ns1:description "This workflow generates binding scores that correlate well with binding affinities using an additional tool SuCOS Max, developed at Oxford University. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - SuCOS scoring" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset Collection" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-14T13:52:01Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:52:18Z"^^ns1:Date ; ns1:description "Continuous flexibility analysis of SARS-CoV-2 Spike prefusion structures" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.160.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "Cryo electron microscopy of SARS-CoV-2 spike in prefusion state" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:51:59Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:04Z"^^ns1:Date ; ns1:description "This workflow generates binding scores that correlate well with binding affinities using an additional tool TransFS, developed at Oxford University. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - TransFS scoring" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T14:56:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:12Z"^^ns1:Date ; ns1:description "This workflow combines SDF files from all fragments into a single dataset and filters to include only the lowest (best) scoring pose for each compound. This file of optimal poses for all ligands is used to compare to a database of Enamine and Chemspace compounds to select the best scoring 500 matches. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - Filter results" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Frank von Delft" . a ns1:Person ; ns1:name "Gianmauro Cuccuru" . a ns1:Person ; ns1:name "Jack Scantlebury" . a ns1:Person ; ns1:name "Rachael Skyner" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Susan Leung" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "3_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-15T13:56:31Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:52:41Z"^^ns1:Date ; ns1:description "Cryo-EM processing workflow" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.183.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "testEntryTitleNew" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-15T15:26:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:52:45Z"^^ns1:Date ; ns1:description "Cryo-EM processing workflow" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.188.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "entryTitleTest" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input Gene List" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GO Enriched Terms" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GO Gene Prioritization" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GO Heatmap" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-24T15:16:59Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:52:49Z"^^ns1:Date ; ns1:description "BioTranslator performs sequentially pathway analysis and gene prioritization: A specific operator is executed for each task to translate the input gene set into semantic terms and pinpoint the pivotal-role genes on the derived semantic network. 
The output consists of the set of statistically significant semantic terms and the associated hub genes (the gene signature), prioritized according to their involvement in the underlying semantic topology." ; ns1:image ; ns1:input ; ns1:isBasedOn ; ns1:keywords "Semantic Network Analysis, Gene Prioritization, Pathway Analysis, Biomedical Ontologies, Semantic Interpretation" ; ns1:license ; ns1:name "BioTranslator Workflow" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2020-04-10T15:02:39Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:21Z"^^ns1:Date ; ns1:description "This workflow is used for the virtual screening of the SARS-CoV-2 main protease (de.NBI-cloud, STFC). It includes Charge enumeration, Generation of 3D conformations, Preparation of active site for docking using rDock, Docking, Scoring and Selection of compounds available. More info can be found at https://covid19.galaxyproject.org/cheminformatics/" ; ns1:image ; ns1:input , , , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Cheminformatics - XChem combined" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Sarai Varona and Miguel Juliá and Sara Monzon and Alexander Peltzer and Alison Meynert and Edgar Garriga Nogales and Erik Garrison and Gisela Gabernet and Harshil Patel and Joao Curado and Jose Espinosa-Carrasco and Katrin Sameith and Marta Pozuelo and Maxime Garcia and Michael Heuer and Phil Ewels and Simon Heumos and Stephen Kelly and Thanh Le Viet and Isabel Cuesta" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:Person ; ns1:name "Sophie Alain" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2021-09-24T14:22:51Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:52:59Z"^^ns1:Date ; ns1:description """ASPICov was developed to provide a rapid, reliable and complete analysis of NGS SARS-Cov2 samples to the biologist. This broad application tool allows to process samples from either capture or amplicon strategy and Illumina or Ion Torrent technology. To ensure FAIR data analysis, this Nextflow pipeline follows nf-core guidelines and use Singularity containers. Availability and Implementation: https://gitlab.com/vtilloy/aspicov Citation: Valentin Tilloy, Pierre Cuzin, Laura Leroi, Emilie Guérin, Patrick Durand, Sophie Alain ASPICov: An automated pipeline for identification of SARS-Cov2 nucleotidic variants PLoS One 2022 Jan 26;17(1):e0262953: https://pubmed.ncbi.nlm.nih.gov/35081137/""" ; ns1:image ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "ASPICov" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input sets of genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Distances Matrix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Heatmap" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-26T18:20:05Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:53:00Z"^^ns1:Date ; ns1:description "This workflow is based on the idea of comparing different gene sets through their semantic interpretation. In many cases, the user studies a specific phenotype (e.g. disease) by analyzing lists of genes resulting from different samples or patients. Their pathway analysis could result in different semantic networks, revealing mechanistic and phenotypic divergence between these gene sets. 
The workflow of BioTranslator Comparative Analysis compares quantitatively the outputs of pathway analysis, based on the topology of the underlying ontological graph, in order to derive a semantic similarity value for each pair of the initial gene sets. The workflow is available in a Galaxy application and can be used for 14 species. The algorithm accepts as input a batch of gene sets, such as BioTranslator, for the same organism. It performs pathway analysis according to the user-selected ontology and then it compares the derived semantic networks and extracts a matrix with their distances, as well as a respective heatmap." ; ns1:image ; ns1:input ; ns1:keywords "Semantic Network Analysis, Semantic Comparison, Pathway Analysis" ; ns1:license ; ns1:name "Workflow of BioTranslator Comparative Analysis" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "energy_min.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "energy_npt.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "energy_nvt.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myeditconf.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfix_side_chain.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_image.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_rgyr.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_trjconv_str.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygrompp.tpr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.cpt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.edr" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rmsd_exp.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rmsd_first.xvg" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-26T19:50:02Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:53:08Z"^^ns1:Date ; ns1:description """Galaxy workflow example that illustrate the process of setting up a simulation system containing a protein, step by step, using the [BioExcel Building Blocks](/projects/11) library (biobb). The particular example used is the Lysozyme protein (PDB code 1AKI). This workflow returns a resulting protein structure and simulated 3D trajectories. Designed for running on the https://dev.usegalaxy.es Galaxy instance.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.194.1" ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Protein MD Setup tutorial using BioExcel Building Blocks (biobb) in Galaxy" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T14:00:17Z"^^ns1:Date ; ns1:dateModified "2026-03-26T14:03:37Z"^^ns1:Date ; ns1:description """# Structural DNA helical parameters from MD trajectory tutorial using BioExcel Building Blocks (biobb) **Based on the [NAFlex](https://mmb.irbbarcelona.org/NAFlex) server and in particular in its [Nucleic Acids Analysis section](https://mmb.irbbarcelona.org/NAFlex/help.php?id=tutorialAnalysisNA).** *** This tutorial aims to illustrate the process of **extracting structural and dynamical properties** from a **DNA MD trajectory helical parameters**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Drew Dickerson Dodecamer** sequence -CGCGAATTCGCG- (PDB code [1BNA](https://www.rcsb.org/structure/1BNA)). The trajectory used is a 500ns-long MD simulation taken from the [BigNASim](https://mmb.irbbarcelona.org/BIGNASim/) database ([NAFlex_DDD_II](https://mmb.irbbarcelona.org/BIGNASim/getStruc.php?idCode=NAFlex_DDD_II) entry). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.195.7" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Structural DNA helical parameters tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:46:01Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:48:52Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.196.7" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook ABC MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-09-29T08:25:31Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:53:17Z"^^ns1:Date ; ns1:description "Analysis of RNA-seq data starting from BAM and focusing on mRNA, lncRNA and miRNA" ; ns1:keywords "" ; ns1:license ; ns1:name "lncRNA" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-14T14:10:58Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:25Z"^^ns1:Date ; ns1:description """

nf-core/viralrecon

GitHub Actions CI Status GitHub Actions Linting Status Nextflow install with bioconda

Docker DOI

nfcore/viralrecon is a bioinformatics analysis pipeline used to perform assembly and intrahost/low-frequency variant calling for viral samples. The pipeline currently supports metagenomics and amplicon sequencing data derived from the Illumina sequencing platform.

This pipeline is a re-implementation of the SARS_Cov2_consensus-nf and SARS_Cov2_assembly-nf pipelines initially developed by Sarai Varona and Sara Monzon from BU-ISCIII. Porting both of these pipelines to nf-core was an international collaboration between numerous contributors and developers, led by Harshil Patel from the The Bioinformatics & Biostatistics Group at The Francis Crick Institute, London. We appreciated the need to have a portable, reproducible and scalable pipeline for the analysis of COVID-19 sequencing samples and so the Avengers Assembled! Please come and join us and add yourself to the contributor list :)

We have integrated a number of options in the pipeline to allow you to run specific aspects of the workflow if you so wish. For example, you can skip all of the assembly steps with the --skip_assembly parameter. See usage docs for all of the available options when running the pipeline.

Please click here to see an example MultiQC report generated using the parameters defined in this configuration file to run the pipeline on samples which were prepared from the ncov-2019 ARTIC Network V1 amplicon set and sequenced on the Illumina MiSeq platform in 301bp paired-end format.

The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. Furthermore, automated continuous integration tests to run the pipeline on a full-sized dataset are passing on AWS cloud.

Pipeline summary

  1. Download samples via SRA, ENA or GEO ids (ENA FTP, parallel-fastq-dump; if required)
  2. Merge re-sequenced FastQ files (cat; if required)
  3. Read QC (FastQC)
  4. Adapter trimming (fastp)
  5. Variant calling
    i. Read alignment (Bowtie 2)
    ii. Sort and index alignments (SAMtools)
    iii. Primer sequence removal (iVar; amplicon data only)
    iv. Duplicate read marking (picard; removal optional)
    v. Alignment-level QC (picard, SAMtools)
    vi. Choice of multiple variant calling and consensus sequence generation routes (VarScan 2, BCFTools, BEDTools || iVar variants and consensus || BCFTools, BEDTools)
    - Variant annotation (SnpEff, SnpSift)
    - Consensus assessment report (QUAST)
  6. De novo assembly
    i. Primer trimming (Cutadapt; amplicon data only)
    ii. Removal of host reads (Kraken 2)
    iii. Choice of multiple assembly tools (SPAdes || metaSPAdes || Unicycler || minia)
    - Blast to reference genome (blastn)
    - Contiguate assembly (ABACAS)
    - Assembly report (PlasmidID)
    - Assembly assessment report (QUAST)
    - Call variants relative to reference (Minimap2, seqwish, vg, Bandage)
    - Variant annotation (SnpEff, SnpSift)
  7. Present QC and visualisation for raw read, alignment, assembly and variant calling results (MultiQC)

Quick Start

i. Install nextflow

ii. Install either Docker or Singularity for full pipeline reproducibility (please only use Conda as a last resort; see docs)

iii. Download the pipeline and test it on a minimal dataset with a single command

nextflow run nf-core/viralrecon -profile test,<docker/singularity/conda/institute>

Please check nf-core/configs to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use -profile <institute> in your command. This will enable either docker or singularity and set the appropriate execution settings for your local compute environment.

iv. Start running your own analysis!

nextflow run nf-core/viralrecon -profile <docker/singularity/conda/institute> --input samplesheet.csv --genome 'NC_045512.2' -profile docker

See usage docs for all of the available options when running the pipeline.

Documentation

The nf-core/viralrecon pipeline comes with documentation about the pipeline, found in the docs/ directory:

  1. Installation
  2. Pipeline configuration
  3. Running the pipeline
  4. Output and how to interpret the results
  5. Troubleshooting

Credits

These scripts were originally written by Sarai Varona, Miguel Juliá and Sara Monzon from BU-ISCIII and co-ordinated by Isabel Cuesta for the Institute of Health Carlos III, Spain. Through collaboration with the nf-core community the pipeline has now been updated substantially to include additional processing steps, to standardise inputs/outputs and to improve pipeline reporting; implemented primarily by Harshil Patel from The Bioinformatics & Biostatistics Group at The Francis Crick Institute, London.

Many thanks to others who have helped out and contributed along the way too, including (but not limited to):

Name Affiliation Alexander Peltzer Boehringer Ingelheim, Germany Alison Meynert University of Edinburgh, Scotland Edgar Garriga Nogales Centre for Genomic Regulation, Spain Erik Garrison UCSC, USA Gisela Gabernet QBiC, University of Tübingen, Germany Joao Curado Flomics Biotech, Spain Jose Espinosa-Carrasco Centre for Genomic Regulation, Spain Katrin Sameith DRESDEN-concept Genome Center, Germany Lluc Cabus Flomics Biotech, Spain Marta Pozuelo Flomics Biotech, Spain Maxime Garcia SciLifeLab, Sweden Michael Heuer UC Berkeley, USA Phil Ewels SciLifeLab, Sweden Simon Heumos QBiC, University of Tübingen, Germany Stephen Kelly Memorial Sloan Kettering Cancer Center, USA Thanh Le Viet Quadram Institute, UK

Listed in alphabetical order

Contributions and Support

If you would like to contribute to this pipeline, please see the contributing guidelines.

For further information or help, don’t hesitate to get in touch on Slack (you can join with this invite).

Citation

If you use nf-core/viralrecon for your analysis, please cite it using the following doi: 10.5281/zenodo.3872730

An extensive list of references for the tools used by the pipeline can be found in the CITATIONS.md file.

You can cite the nf-core publication as follows:

The nf-core framework for community-curated bioinformatics pipelines.

Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.

Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x.
ReadCube: Full Access Link

""" ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "nf-core/viralrecon" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andreas Wilm and October SESSIONS and Paola Florez DE SESSIONS and ZHU Yuan and Shuzhen SIM and CHU Wenhan Collins" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-09-29T08:26:39Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:53:23Z"^^ns1:Date ; ns1:description "This PyCOMPSs workflow tutorial aims to illustrate the process of setting up a simulation system containing a protein, step by step, using the BioExcel Building Blocks library (biobb) in PyCOMPSs for execution on HPC. Three variants of the MD Setup workflows are included, supporting a list of structures, a list of mutations, or a cumulative set of mutations. " ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.200.1" ; ns1:isPartOf ; ns1:keywords "molecular dynamics, GROMACS, BioBB" ; ns1:license ; ns1:name "Protein MD Setup HPC tutorial using BioExcel Building Blocks (biobb) in PyCOMPSs" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "KNIME" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-09-29T14:25:42Z"^^ns1:Date ; ns1:dateModified "2022-04-11T09:27:55Z"^^ns1:Date ; ns1:description """This is an experimental KNIME workflow of using the BioExcel building blocks to implement the Protein MD Setup tutorial for molecular dynamics with GROMACS. 
Note that this workflow won't import in KNIME without the [experimental KNIME nodes](https://bioexcel.eu/research/projects/biobb_knime/) for BioBB - contact Adam Hospital for details.""" ; ns1:encodingFormat "application/zip" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.201.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Protein MD Setup tutorial using BioExcel Building Blocks (biobb) in KNIME" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-14T14:42:23Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:25Z"^^ns1:Date ; ns1:description """

nf-core/vipr

Build Status Nextflow Gitter

install with bioconda Docker Container available https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg

nf-core/vipr is a bioinformatics best-practice analysis pipeline for assembly and intrahost / low-frequency variant calling for viral samples.

The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker / singularity containers making installation trivial and results highly reproducible.

Pipeline Steps

Step Main program/s Trimming, combining of read-pairs per sample and QC Skewer, FastQC Decontamination decont Metagenomics classification / Sample purity Kraken Assembly to contigs BBtools’ Tadpole Assembly polishing ViPR Tools Mapping to assembly BWA, LoFreq Low frequency variant calling LoFreq Coverage and variant AF plots (two processes) Bedtools, ViPR Tools

Documentation

Documentation about the pipeline can be found in the docs/ directory:

  1. Installation and configuration
  2. Running the pipeline
  3. Output and how to interpret the results

Credits

This pipeline was originally developed by Andreas Wilm (andreas-wilm) at Genome Institute of Singapore.
It started out as an ecosystem around LoFreq and went through a couple of iterations.
The current version had three predecessors ViPR 1, ViPR 2 and ViPR 3.

An incomplete list of publications using (previous versions of) ViPR:

Plenty of people provided essential feedback, including:

""" ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "nf-core/vipr" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-10-19T10:47:54Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:53:24Z"^^ns1:Date ; ns1:description """This notebook is about pre-processing the Auditory Brainstem Response (ABR) raw data files provided by [Ingham et. al](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000194) to create a data set for Deep Learning models. The unprocessed ABR data files are available at [Dryad](https://datadryad.org/stash/dataset/doi:10.5061/dryad.cv803rv). Since the ABR raw data are available as zip-archives, these have to be unzipped and the extracted raw data files parsed so that the time series corresponding to the ABR audiograms can be saved in a single csv file. The final data set contains the ABR time series, an individual mouse identifier, stimulus frequency, stimulus sound pressure level (SPL) and a manually determined hearing threshold. For each mouse there are different time series corresponding to six different sound stimuli: broadband click, 6, 12, 18, 24, and 30 kHz, each of which was measured for a range of sound pressure levels. The exact range of sound levels can vary between the different mice and stimuli. The following is done: * The zip archives are unpacked. * The extracted ABR raw data files are parsed and collected in one csv file per archive. * The csv files are merged into a data set of time series. Each time series corresponds to an ABR audiogram measured for a mouse at a specific frequency and sound level. * The mouse phenotyping data are available in Excel format. The individual data sheets are combined into one mouse phenotyping data set, maintaining the mouse pipeline and the cohort type mapping. 
In addition, the hearing thresholds are added to the ABR audiogram data set. * The data sets are curated: * there is a single curve per mouse, stimulus frequency and sound level, * each sound level is included in the list of potential sound pressure levels, * for each mouse for which an ABR audiogram has been measured, mouse phenotyping data are also provided.""" ; ns1:keywords "ABR, DL" ; ns1:license ; ns1:name "Preparing a data set for Deep Learning from zipped ABR raw data files" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEM SBML" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Top Ranking Pathways" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rpSBML TAR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input BAM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BAM file information" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Converted FastQ reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC HTML" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC text file" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-10-17T02:51:00Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# BAM-to-FASTQ-QC ## General recommendations for using BAM-to-FASTQ-QC Please see the [`Genome assembly with hifiasm on Galaxy Australia`](https://australianbiocommons.github.io/how-to-guides/genome_assembly/hifi_assembly) guide. 
## Acknowledgements The workflow & the [doc_guidelines template used](https://github.com/AustralianBioCommons/doc_guidelines) are supported by the Australian BioCommons via Bioplatforms Australia funding, the Australian Research Data Commons (https://doi.org/10.47486/PL105) and the Queensland Government RICF programme. Bioplatforms Australia and the Australian Research Data Commons are enabled by the National Collaborative Research Infrastructure Strategy (NCRIS). """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.220.2" ; ns1:input ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "BAM, FASTQ, Conversion, QC" ; ns1:license ; ns1:name "BAM to FASTQ + QC v1.0" ; ns1:output , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FASTQ input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFi Adapter Filter on input dataset(s): blocklist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFi Adapter Filter on input dataset(s): clean reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFi Adapter Filter on input dataset(s): contaminant blastout" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFi Adapter Filter on input dataset(s): contaminant statistic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "converted FASTA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "primary bandage image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "primary bandage info" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-10-21T05:13:12Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# PacBio HiFi genome assembly using hifiasm v2.1 ## General usage recommendations Please see the [Genome assembly with hifiasm on Galaxy Australia](https://australianbiocommons.github.io/how-to-guides/genome_assembly/hifi_assembly) guide. ## See [change log](./change_log.md) ## Acknowledgements The workflow & the [doc_guidelines template used](https://github.com/AustralianBioCommons/doc_guidelines) are supported by the Australian BioCommons via Bioplatforms Australia funding, the Australian Research Data Commons (https://doi.org/10.47486/PL105) and the Queensland Government RICF programme. Bioplatforms Australia and the Australian Research Data Commons are enabled by the National Collaborative Research Infrastructure Strategy (NCRIS). """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.221.3" ; ns1:input ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "FASTQ, hifiasm, HiFi, genome_assembly" ; ns1:license ; ns1:name "PacBio HiFi genome assembly using hifiasm v2.1" ; ns1:output , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file: Illumina reads R1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file: Illumina reads R2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file: long reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC on input dataset(s): Webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "NanoPlot on input dataset(s): HTML report" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T04:34:47Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Data QC step, can run alone or as part of a combined workflow for large genome assembly. * What it does: Reports statistics from sequencing reads. * Inputs: long reads (fastq.gz format), short reads (R1 and R2) (fastq.gz format). * Outputs: For long reads: a nanoplot report (the HTML report summarizes all the information). For short reads: a MultiQC report. * Tools used: Nanoplot, FastQC, MultiQC. * Input parameters: None required. * Workflow steps: Long reads are analysed by Nanoplot; Short reads (R1 and R2) are analysed by FastQC; the resulting reports are processed by MultiQC. * Options: see the tool settings options at runtime and change as required. Alternative tool option: fastp Infrastructure_deployment_metadata: Galaxy Australia (Galaxy) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.222.1" ; ns1:image ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Data QC" ; ns1:output , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Illumina reads R1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope on input dataset(s) Linear plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope on input dataset(s) Log plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope on input dataset(s) Transformed linear plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope on input dataset(s) Transformed log plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl on input dataset(s): read-db.meryldb" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T04:47:27Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Kmer counting step, can run alone or as part of a combined workflow for large genome assembly. * What it does: Estimates genome size and heterozygosity based on counts of kmers * Inputs: One set of short reads: e.g. R1.fq.gz * Outputs: GenomeScope graphs * Tools used: Meryl, GenomeScope * Input parameters: None required * Workflow steps: The tool meryl counts kmers in the input reads (k=21), then converts this into a histogram. GenomeScope: runs a model on the histogram; reports estimates. k-mer size set to 21. * Options: Use a different kmer counting tool. e.g. khmer. Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.223.1" ; ns1:image ; ns1:input ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "kmer counting - meryl" ; ns1:output , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Illumina reads R1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Illumina reads R2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "long reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp filtered R1 reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp filtered R2 reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp filtered long reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp report on long reads html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp report on long reads json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp report on short reads html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp report on short reads json" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T04:56:09Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Trim and filter reads; can run alone or as part of a combined workflow for large genome assembly. * What it does: Trims and filters raw sequence reads according to specified settings. * Inputs: Long reads (format fastq); Short reads R1 and R2 (format fastq) * Outputs: Trimmed and filtered reads: fastp_filtered_long_reads.fastq.gz (But note: no trimming or filtering is on by default), fastp_filtered_R1.fastq.gz, fastp_filtered_R2.fastq.gz * Reports: fastp report on long reads, html; fastp report on short reads, html * Tools used: fastp (Note. The latest version (0.20.1) of fastp has an issue displaying plot results. Using version 0.19.5 here instead until this is rectified). * Input parameters: None required, but recommend removing the long reads from the workflow if not using any trimming/filtering settings. Workflow steps: Long reads: fastp settings: * These settings have been changed from the defaults (so that all filtering and trimming settings are now disabled). 
* Adapter trimming options: Disable adapter trimming: yes * Filter options: Quality filtering options: Disable quality filtering: yes * Filter options: Length filtering options: Disable length filtering: yes * Read modification options: PolyG tail trimming: Disable * Output options: output JSON report: yes Short reads: fastp settings: * adapter trimming (default setting: adapters are auto-detected) * quality filtering (default: phred quality 15), unqualified bases limit (default = 40%), number of Ns allowed in a read (default = 5) * length filtering (default length = min 15) * polyG tail trimming (default = on for NextSeq/NovaSeq data which is auto detected) * Output options: output JSON report: yes Options: * Change any settings in fastp for any of the input reads. * Adapter trimming: input the actual adapter sequences. (Alternative tool for long read adapter trimming: Porechop.) * Trimming n bases from ends of reads if quality less than value x (Alternative tool for trimming long reads: NanoFilt.) * Discard post-trimmed reads if length is < x (e.g. for long reads, 1000 bp) * Example filtering/trimming that you might do on long reads: remove adapters (can also be done with Porechop), trim bases from ends of the reads with low quality (can also be done with NanoFilt), after this can keep only reads of length x (e.g. 1000 bp) Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.224.1" ; ns1:image ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Trim and filter reads - fastp" ; ns1:output , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "long reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bandage Image on input dataset(s): Assembly Graph Image" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bar chart showing contig sizes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly on input dataset(s) (Graphical Fragment Assembly)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly on input dataset(s) (assembly_graph)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly on input dataset(s) (assembly_info)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly on input dataset(s) (consensus)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly on input dataset(s) (log)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): PDF report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): Log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): tabular report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T05:07:16Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Assembly with Flye; can run alone or as part of a combined workflow for large genome assembly. 
* What it does: Assembles long reads with the tool Flye * Inputs: long reads (may be raw, or filtered, and/or corrected); fastq.gz format * Outputs: Flye assembly fasta; Fasta stats on assembly.fasta; Assembly graph image from Bandage; Bar chart of contig sizes; Quast reports of genome assembly * Tools used: Flye, Fasta statistics, Bandage, Bar chart, Quast * Input parameters: None required, but recommend setting assembly mode to match input sequence type Workflow steps: * Long reads are assembled with Flye, using default tool settings. Note: the default setting for read type ("mode") is nanopore raw. Change this at runtime if required. * Statistics are computed from the assembly.fasta file output, using Fasta Statistics and Quast (is genome large: Yes; distinguish contigs with more that 50% unaligned bases: no) * The graphical fragment assembly file is visualized with the tool Bandage. * Assembly information sent to bar chart to visualize contig sizes Options * See other Flye options. * Use a different assembler (in a different workflow). * Bandage image options - change size (max size is 32767), labels - add (e.g. node lengths). You can also install Bandage on your own computer and donwload the "graphical fragment assembly" file to view in greater detail. Infrastructure_deployment_metadata: Galaxy Australia (Galaxy) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.225.1" ; ns1:image ; ns1:input ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Assembly with Flye" ; ns1:output , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-04-19T01:34:42Z"^^ns1:Date ; ns1:dateModified "2026-04-19T01:44:02Z"^^ns1:Date ; ns1:description """Assembly polishing; can run alone or as part of a combined workflow for large genome assembly. * What it does: Polishes (corrects) an assembly, using long reads (with the tools Racon and Medaka) and short reads (with the tool Racon). (Note: medaka is only for nanopore reads, not PacBio reads). * Inputs: assembly to be polished: assembly.fasta; long reads - the same set used in the assembly (e.g. may be raw or filtered) fastq.gz format; short reads, R1 only, in fastq.gz format * Outputs: Racon+Medaka+Racon polished_assembly. fasta; Fasta statistics after each polishing tool * Tools used: Minimap2, Racon, Fasta statistics, Medaka * Input parameters: None required. The Medaka model is set to r941_min_hac_g507 (R9.4.1 pore, MinION, Guppy 5 High Accuracy Calling). To use a different model, edit the workflow before importing. Workflow steps: -1- Polish with long reads: using Racon * Long reads and assembly contigs => Racon polishing (subworkflow): * minimap2 : long reads are mapped to assembly => overlaps.paf. * overaps, long reads, assembly => Racon => polished assembly 1 * using polished assembly 1 as input; repeat minimap2 + racon => polished assembly 2 * using polished assembly 2 as input, repeat minimap2 + racon => polished assembly 3 * using polished assembly 3 as input, repeat minimap2 + racon => polished assembly 4 * Racon long-read polished assembly => Fasta statistics * Note: The Racon tool panel can be a bit confusing and is under review for improvement. 
Presently it requires sequences (= long reads), overlaps (= the paf file created by minimap2), and target sequences (= the contigs to be polished) as per "usage" described here https://github.com/isovic/racon/blob/master/README.md * Note: Racon: the default setting for "output unpolished target sequences?" is No. This has been changed to Yes for all Racon steps in these polishing workflows. This means that even if no polishes are made in some contigs, they will be part of the output fasta file. * Note: the contigs output by Racon have new tags in their headers. For more on this see https://github.com/isovic/racon/issues/85. -2- Polish with long reads: using Medaka * Racon polished assembly + long reads => medaka polishing X1 => medaka polished assembly * Medaka polished assembly => Fasta statistics -3- Polish with short reads: using Racon * Short reads and Medaka polished assembly =>Racon polish (subworkflow): * minimap2: short reads (R1 only) are mapped to the assembly => overlaps.paf. Minimap2 setting is for short reads. * overlaps + short reads + assembly => Racon => polished assembly 1 * using polished assembly 1 as input; repeat minimap2 + racon => polished assembly 2 * Racon short-read polished assembly => Fasta statistics Options * Change settings for Racon long read polishing if using PacBio reads: The default profile setting for Racon long read polishing: minimap2 read mapping is "Oxford Nanopore read to reference mapping", which is specified as an input parameter to the whole Assembly polishing workflow, as text: map-ont. If you are not using nanopore reads and/or need a different setting, change this input. To see the other available settings, open the minimap2 tool, find "Select a profile of preset options", and click on the drop down menu. For each described option, there is a short text in brackets at the end (e.g. map-pb). This is the text to enter into the assembly polishing workflow at runtime instead of the default (map-ont). 
* Other options: change the number of polishes (in Racon and/or Medaka). There are ways to assess how much improvement in assembly quality has occurred per polishing round (for example, the number of corrections made; the change in Busco score - see section Genome quality assessment for more on Busco). * Option: change polishing settings for any of these tools. Note: for Racon - these will have to be changed within those subworkflows first. Then, in the main workflow, update the subworkflows, and re-save. Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.226.2" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Assembly polishing" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly to be polished" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "long reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minimap setting (for long reads) " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly polished by long reads using Racon" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T05:45:09Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Assembly polishing subworkflow: Racon polishing with long reads Inputs: long reads and assembly contigs Workflow steps: * minimap2 : long reads are mapped to assembly => overlaps.paf. 
* overaps, long reads, assembly => Racon => polished assembly 1 * using polished assembly 1 as input; repeat minimap2 + racon => polished assembly 2 * using polished assembly 2 as input, repeat minimap2 + racon => polished assembly 3 * using polished assembly 3 as input, repeat minimap2 + racon => polished assembly 4 * Racon long-read polished assembly => Fasta statistics * Note: The Racon tool panel can be a bit confusing and is under review for improvement. Presently it requires sequences (= long reads), overlaps (= the paf file created by minimap2), and target sequences (= the contigs to be polished) as per "usage" described here https://github.com/isovic/racon/blob/master/README.md * Note: Racon: the default setting for "output unpolished target sequences?" is No. This has been changed to Yes for all Racon steps in these polishing workflows. This means that even if no polishes are made in some contigs, they will be part of the output fasta file. * Note: the contigs output by Racon have new tags in their headers. For more on this see https://github.com/isovic/racon/issues/85. Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.227.1" ; ns1:image ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Racon polish with long reads, x4" ; ns1:output ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly to be polished" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Illumina reads, R1, in fastq.gz format" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly polished by short reads using Racon" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T05:50:40Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Assembly polishing subworkflow: Racon polishing with short reads Inputs: short reads and assembly (usually pre-polished with other tools first, e.g. Racon + long reads; Medaka) Workflow steps: * minimap2: short reads (R1 only) are mapped to the assembly => overlaps.paf. Minimap2 setting is for short reads. * overlaps + short reads + assembly => Racon => polished assembly 1 * using polished assembly 1 as input; repeat minimap2 + racon => polished assembly 2 * Racon short-read polished assembly => Fasta statistics Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.228.1" ; ns1:image ; ns1:input , ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Racon polish with Illumina reads, x2" ; ns1:output ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Polished assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco short summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco summary image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): PDF report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): Log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast on input dataset(s): tabular report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T06:03:05Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """Assess genome quality; can run alone or as part of a combined workflow for large genome assembly. * What it does: Assesses the quality of the genome assembly: generate some statistics and determine if expected genes are present; align contigs to a reference genome. * Inputs: polished assembly; reference_genome.fasta (e.g. of a closely-related species, if available). * Outputs: Busco table of genes found; Quast HTML report, and link to Icarus contigs browser, showing contigs aligned to a reference genome * Tools used: Busco, Quast * Input parameters: None required Workflow steps: Polished assembly => Busco * First: predict genes in the assembly: using Metaeuk * Second: compare the set of predicted genes to the set of expected genes in a particular lineage. Default setting for lineage: Eukaryota Polished assembly and a reference genome => Quast * Contigs/scaffolds file: polished assembly * Type of assembly: Genome * Use a reference genome: Yes * Reference genome: Arabidopsis genome * Is the genome large (> 100Mbp)? Yes. * All other settings as defaults, except second last setting: Distinguish contigs with more than 50% unaligned bases as a separate group of contigs?: change to No Options Gene prediction: * Change tool used by Busco to predict genes in the assembly: instead of Metaeuk, use Augustus. * To do this: select: Use Augustus; Use another predefined species model; then choose from the drop down list. * Select from a database of trained species models. 
list here: https://github.com/Gaius-Augustus/Augustus/tree/master/config/species * Note: if using Augustus: it may fail if the input assembly is too small (e.g. a test-size data assembly). It can't do the training part properly. Compare genes found to other lineage: * Busco has databases of lineages and their expected genes. Option to change lineage. * Not all lineages are available - there is a mix of broader and narrower lineages. - list of lineages here: https://busco.ezlab.org/list_of_lineages.html. * To see the groups in taxonomic hierarchies: Eukaryotes: https://busco.ezlab.org/frames/euka.htm * For example, if you have a plant species from Fabales, you could set that as the lineage. * The narrower the taxonomic group, the more total genes are expected. Infrastructure_deployment_metadata: Galaxy Australia (Galaxy) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.229.1" ; ns1:image ; ns1:input , ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Assess genome quality" ; ns1:output , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-29T10:01:25Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:28Z"^^ns1:Date ; ns1:description "Given a set of pathways generated by RetroPath2.0, this workflow informs the user as to the theoretically best performing ones based on four criteria: FBA, thermodynamic feasibility, length of the pathway, and reaction rule score." ; ns1:input , , ; ns1:keywords "Retrosynthesis" ; ns1:license ; ns1:name "Pathway Analysis" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEM SBML" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of genetic parts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Max (UNIPROT) enzymes per reactions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Top pathways to convert to SBOL" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rpSBML TAR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pathway SBOLs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-08T06:08:25Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:54:36Z"^^ns1:Date ; ns1:description """Combined workflow for large genome assembly The tutorial document for this workflow is here: https://doi.org/10.5281/zenodo.5655813 What it does: A workflow for genome assembly, containing subworkflows: * Data QC * Kmer counting * Trim and filter reads * Assembly with Flye * Assembly polishing * Assess genome quality Inputs: * long reads and short reads in fastq format * reference genome for Quast Outputs: * Data information - QC, kmers * Filtered, trimmed reads * Genome assembly, assembly graph, stats * Polished assembly, stats * Quality metrics - Busco, Quast Options * Omit some steps - e.g. Data QC and kmer counting * Replace a module with one using a different tool - e.g. 
change assembly tool Infrastructure_deployment_metadata: Galaxy Australia (Galaxy)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.230.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "Large-genome-assembly" ; ns1:license ; ns1:name "Combined workflows for large genome assembly" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "\"Input Dataset Collection\"" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-11-10T00:15:14Z"^^ns1:Date ; ns1:dateModified "2024-04-17T04:19:28Z"^^ns1:Date ; ns1:description "MetaDEGalaxy: Galaxy workflow for differential abundance analysis of 16s metagenomic data" ; ns1:input ; ns1:isPartOf ; ns1:keywords "MetaDEGalaxy" ; ns1:license ; ns1:name "16S_biodiversity_for_overlap_paired_end" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "\"Input Dataset Collection\"" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-11-10T00:16:43Z"^^ns1:Date ; ns1:dateModified "2024-04-17T04:17:52Z"^^ns1:Date ; ns1:description "" ; ns1:input ; ns1:isPartOf ; ns1:keywords "MetaDEGalaxy" ; ns1:license ; ns1:name "16S_biodiversity_for_nonoverlap_paired_end" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Rob Edwards" . a ns1:Person ; ns1:name "Scott Handley" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2021-11-11T03:37:33Z"^^ns1:Date ; ns1:dateModified "2024-05-13T02:08:43Z"^^ns1:Date ; ns1:description """A hecatomb is a great sacrifice or an extensive loss. Heactomb the software empowers an analyst to make data driven decisions to 'sacrifice' false-positive viral reads from metagenomes to enrich for true-positive viral reads. This process frequently results in a great loss of suspected viral sequences / contigs. 
For information about installation, usage, tutorial etc please refer to the documentation: https://hecatomb.readthedocs.io/en/latest/ ### Quick start guide Install Hecatomb from Bioconda ```bash # create an env called hecatomb and install Hecatomb in it conda create -n hecatomb -c conda-forge -c bioconda hecatomb # activate conda env conda activate hecatomb # check the installation hecatomb -h # download the databases - you only have to do this once hecatomb install # Run the test dataset hecatomb run --test ```""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.235.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Hecatomb" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Michael Franklin; Jiaan Yu; Juny Kesumadewi" . a ns1:ComputerLanguage ; ns1:name "Janis" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "align_and_sort_sortsam_tmpDir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cutadapt_adapters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastqs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_intervals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "known_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mills_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snps_1000gp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snps_dbsnp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_bam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_fastqc_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_performance_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_variants_bamstats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_variants_gatk" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_variants_gatk_split" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-11-12T02:30:06Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:54:51Z"^^ns1:Date ; ns1:description """This is a genomics pipeline to do a single germline sample variant-calling, adapted from GATK Best Practice Workflow. This workflow is a reference pipeline for using the Janis Python framework (pipelines assistant). - Alignment: bwa-mem - Variant-Calling: GATK HaplotypeCaller - Outputs the final variants in the VCF format. **Resources** This pipeline has been tested using the HG38 reference set, available on Google Cloud Storage through: - https://console.cloud.google.com/storage/browser/genomics-public-data/references/hg38/v0/ This pipeline expects the assembly references to be as they appear in that storage (".fai", ".amb", ".ann", ".bwt", ".pac", ".sa", "^.dict"). The known sites (snps_dbsnp, snps_1000gp, known_indels, mills_indels) should be gzipped and tabix indexed. Infrastructure_deployment_metadata: Spartan (Unimelb)""" ; ns1:image ; ns1:input , , , , , , , , , ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Janis Germline Variant-Calling Workflow (GATK)" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Gareth Price" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFi reads as FASTQ" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiFiASM 1o assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purge overlaps on input dataset(s): purge_dups bed file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purge overlaps on input dataset(s): purge_dups log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-10-17T02:53:20Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# Purge-duplicates-from-hifiasm-assembly ## General recommendations for using `Purge-duplicates-from-hifiasm-assembly` Please see the [`Genome assembly with hifiasm on Galaxy Australia`](https://australianbiocommons.github.io/how-to-guides/genome_assembly/hifi_assembly) guide. ## Acknowledgements The workflow & the [doc_guidelines template used](https://github.com/AustralianBioCommons/doc_guidelines) are supported by the Australian BioCommons via Bioplatforms Australia funding, the Australian Research Data Commons (https://doi.org/10.47486/PL105) and the Queensland Government RICF programme. Bioplatforms Australia and the Australian Research Data Commons are enabled by the National Collaborative Research Infrastructure Strategy (NCRIS). """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.237.2" ; ns1:input , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "Assembly, purge_dups, HiFi" ; ns1:license ; ns1:name "Purge duplicates from hifiasm assembly v1.0" ; ns1:output , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Andrea Zaliani" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-11-16T10:19:16Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:00Z"^^ns1:Date ; ns1:description """# Summary This notebook demonstrates how to retrieve metadata associated to the paper [A SARS-CoV-2 cytopathicity dataset generated by high-content screening of a large drug repurposing collection](https://doi.org/10.1038/s41597-021-00848-4) and available in IDR at [idr0094-ellinger-sarscov2](https://idr.openmicroscopy.org/search/?query=Name:idr0094). Over 300 compounds were used in this investigation. This notebook allows the user to calculate the half maximal inhibitory concentration (IC50) for each compound. IC50 is a measure of the potency of a substance in inhibiting a specific biological or biochemical function. IC50 is a quantitative measure that indicates how much of a particular inhibitory substance (e.g. drug) is needed to inhibit, in vitro, a given biological process or biological component by 50%. User can download the IC50 for each compound used in that study The notebook can be launched in [My Binder](https://mybinder.org/v2/gh/IDR/idr0094-ellinger-sarscov2/master?urlpath=notebooks%2Fnotebooks%2Fidr0094-ic50.ipynb%3FscreenId%3D2603). A shiny app is also available for dynamic plotting of the IC50 curve for each compound. This R shiny app can be launched in [My Binder](https://mybinder.org/v2/gh/IDR/idr0094-ellinger-sarscov2/master?urlpath=shiny/apps/) # Inputs Parameters needed to configure the workflow: **screenId**: Identifier of a screen in IDR. # Ouputs Output file generated: **ic50.csv**: Comma separate value file containing the IC50 for each compound. 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.238.1" ; ns1:isPartOf ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Calculate the half maximal inhibitory concentration (IC50) for each compound used in a SARS-CoV-2 study" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "readgroup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sorted_bam" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-19T10:06:37Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:02Z"^^ns1:Date ; ns1:description """Exome Alignment Workflow """ ; ns1:image ; ns1:input , , , ; ns1:keywords "cancer, pediatric, Alignment" ; ns1:license ; ns1:name "exome-alignment" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-29T10:05:20Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:35Z"^^ns1:Date ; ns1:description "This workflow converts the top-ranking predicted pathways from the \"RetroSynthesis\" and \"Pathway Analysis\" workflows to plasmids intended to be expressed in the specified organism" ; ns1:input , , , , ; ns1:keywords "Retrosynthesis, genetic design, pathway prediction" ; ns1:license ; ns1:name "Genetic Design" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEM SBML" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximal Pathway Length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Target InChI" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SBML Pathways" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-19T10:11:36Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:08Z"^^ns1:Date ; ns1:description "Exome SAMtools Workflow" ; ns1:keywords "cancer, pediatric, SAMTools" ; ns1:license ; ns1:name "exome-samtools" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-11-21T05:26:02Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:08Z"^^ns1:Date ; ns1:description """`atavide` is a complete workflow for metagenomics data analysis, including QC/QA, optional host removal, assembly and cross-assembly, and individual read based annotations. We have also built in some advanced analytics including tools to assign annotations from reads to contigs, and to generate metagenome-assembled genomes in several different ways, giving you the power to explore your data! 
`atavide` is 100% snakemake and conda, so you only need to install the snakemake workflow, and then everything else will be installed with conda. Steps: 1. QC/QA with [prinseq++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) 2. optional host removal using bowtie2 and samtools, [as described previously](https://edwards.flinders.edu.au/command-line-deconseq/). To enable this, you need to provide a path to the host db and a host db. Metagenome assembly 1. pairwise assembly of each sample using [megahit](https://github.com/voutcn/megahit) 2. extraction of all reads that do not assemble using samtools flags 3. assembly of all unassembled reads using [megahit](https://github.com/voutcn/megahit) 4. compilation of _all_ contigs into a single unified set using [Flye](https://github.com/fenderglass/Flye) 5. comparison of reads -> contigs to generate coverage MAG creation 1. [metabat](https://bitbucket.org/berkeleylab/metabat/src/master/) 2. [concoct](https://github.com/BinPro/CONCOCT) 3. Pairwise comparisons using [turbocor](https://github.com/dcjones/turbocor) followed by clustering Read-based annotations 1. [Kraken2](https://ccb.jhu.edu/software/kraken2/) 2. [singlem](https://github.com/wwood/singlem) 3. [SUPER-focus](https://github.com/metageni/SUPER-FOCUS) 4. [FOCUS](https://github.com/metageni/FOCUS) Want something else added to the suite? File an issue on github and we'll add it ASAP! ### Installation You will need to install 1. The NCBI taxonomy database somewhere 2. The superfocus databases somewhere, and set the SUPERFOCUS_DB environmental variable Everything else should install automatically.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.241.1" ; ns1:keywords "" ; ns1:license ; ns1:name "atavide" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-11-23T11:28:07Z"^^ns1:Date ; ns1:dateModified "2026-02-19T16:17:04Z"^^ns1:Date ; ns1:description """# Summary This notebook shows how to integrate genomic and image data resources. This notebook looks at the question **Which diabetes related genes are expressed in the pancreas?** Steps: * Query humanmine.org, an integrated database of Homo sapiens genomic data using the intermine API to find the genes. * Using the list of found genes, search in the Image Data Resource (IDR) for images linked to the genes, tissue and disease. * We use the [intermine Python API](https://github.com/intermine/intermine-ws-python) and the IDR Python API. The notebook can be opened in [Colab](https://colab.research.google.com/github/IDR/idr-notebooks/blob/master/humanmine.ipynb) # Inputs Parameters needed to configure the workflow: * TISSUE = "Pancreas" * DISEASE = "diabetes" # Ouputs * List of genes found using [HumanMine](https://pubmed.ncbi.nlm.nih.gov/35820040/) * List of images from IDR for one of the gene found""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.242.1" ; ns1:image ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Diabetes related genes expressed in pancreas" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Javier Garrayo-Ventas" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2021-11-29T15:21:23Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:10Z"^^ns1:Date ; ns1:description """## Description The workflow takes an input file with Cancer Driver Genes predictions (i.e. the results provided by a participant), computes a set of metrics, and compares them against the data currently stored in OpenEBench within the TCGA community. 
Two assessment metrics are provided for that predictions. Also, some plots (which are optional) that allow to visualize the performance of the tool are generated. The workflow consists in three standard steps, defined by OpenEBench. The tools needed to run these steps are containerised in three Docker images, whose recipes are available in the [TCGA_benchmarking_dockers](https://github.com/inab/TCGA_benchmarking_dockers ) repository and the images are stored in the [INB GitLab container registry](https://gitlab.bsc.es/inb/elixir/openebench/workflows/tcga_benchmarking_dockers/container_registry) . Separated instances are spawned from these images for each step: 1. **Validation**: the input file format is checked and, if required, the content of the file is validated (e.g check whether the submitted gene IDs exist) 2. **Metrics Generation**: the predictions are compared with the 'Gold Standards' provided by the community, which results in two performance metrics - precision (Positive Predictive Value) and recall(True Positive Rate). 3. **Consolidation**: the benchmark itself is performed by merging the tool metrics with the rest of TCGA data. The results are provided in JSON format and SVG format (scatter plot). ![OpenEBench benchmarking workflow](https://raw.githubusercontent.com/inab/TCGA_benchmarking_workflow/1.0.8/workflow_schema.jpg) ## Data * [TCGA_sample_data](./TCGA_sample_data) folder contains all the reference data required by the steps. It is derived from the manuscript: [Comprehensive Characterization of Cancer Driver Genes and Mutations](https://www.cell.com/cell/fulltext/S0092-8674%2818%2930237-X?code=cell-site), Bailey et al, 2018, Cell [![doi:10.1016/j.cell.2018.02.060](https://img.shields.io/badge/doi-10.1016%2Fj.cell.2018.02.060-green.svg)](https://doi.org/10.1016/j.cell.2018.02.060) * [TCGA_sample_out](./TCGA_sample_out) folder contains an example output for a worklow run, with two cancer types / challenges selected (ACC, BRCA). 
Results obtained from the default execution should be similar to those ones available in this directory. Results found in [TCGA_sample_out/results](./TCGA_sample_out/results) can be visualized in the browser using [`benchmarking_workflows_results_visualizer` javascript library](https://github.com/inab/benchmarking_workflows_results_visualizer). ## Requirements This workflow depends on three tools that have to be installed before you can run it: * [Git](https://git-scm.com/downloads): Used to download the workflow from GitHub. * [Docker](https://docs.docker.com/get-docker/): The Docker Engine is used under the hood to execute the containerised steps of the benchmarking workflow. * [Nextflow](https://www.nextflow.io/): Is the technology used to write and execute the benchmarking workflow. Note that it depends on Bash (>=3.2) and Java (>=8 , <=17). We provide the script [run_local_nextflow.bash](run_local_nextflow.bash) that automates their installation for local testing. Check that these tools are available in your environment: ``` # Git > which git /usr/bin/git > git --version git version 2.26.2 # Docker > which docker /usr/bin/docker > docker --version Docker version 20.10.9-ce, build 79ea9d308018 # Nextflow > which nextflow /home/myuser/bin/nextflow > nextflow -version N E X T F L O W version 21.04.1 build 5556 created 14-05-2021 15:20 UTC (17:20 CEST) cite doi:10.1038/nbt.3820 http://nextflow.io ``` In the case of docker, apart from being installed the daemon has to be running. 
On Linux distributions that use `Systemd` for service management, which includes the most popular ones as of 2021 (Ubuntu, Debian, CentOs, Red Hat, OpenSuse), the `systemctl` command can be used to check its status and manage it: ``` # Check status of docker daemon > sudo systemctl status docker ● docker.service - Docker Application Container Engine Loaded: loaded (/usr/lib/systemd/system/docker.service; disabled; vendor preset: disabled) Active: inactive (dead) Docs: http://docs.docker.com # Start docker daemon > sudo systemctl start docker ``` ### Download workflow Simply clone the repository and check out the latest tag (currently `1.0.8`): ``` # Clone repository > git clone https://github.com/inab/TCGA_benchmarking_dockers.git # Move to new directory cd TCGA_benchmarking_workflow/ # Checkout version 1.0.8 > git checkout 1.0.8 -b 1.0.8 ``` ## Usage The workflow can be run workflow in two different ways: * Standard: `nextflow run main.nf -profile docker` * Using the bash script that installs Java and Nextflow:`./run_local_nextflow.bash run main.nf -profile docker`. 
Arguments specifications: ``` Usage: Run the pipeline with default parameters: nextflow run main.nf -profile docker Run with user parameters: nextflow run main.nf -profile docker --predictionsFile {driver.genes.file} --public_ref_dir {validation.reference.file} --participant_name {tool.name} --metrics_ref_dir {gold.standards.dir} --cancer_types {analyzed.cancer.types} --assess_dir {benchmark.data.dir} --results_dir {output.dir} Mandatory arguments: --input List of cancer genes prediction --community_id Name or OEB permanent ID for the benchmarking community --public_ref_dir Directory with list of cancer genes used to validate the predictions --participant_id Name of the tool used for prediction --goldstandard_dir Dir that contains metrics reference datasets for all cancer types --challenges_ids List of types of cancer selected by the user, separated by spaces --assess_dir Dir where the data for the benchmark are stored Other options: --validation_result The output directory where the results from validation step will be saved --augmented_assess_dir Dir where the augmented data for the benchmark are stored --assessment_results The output directory where the results from the computed metrics step will be saved --outdir The output directory where the consolidation of the benchmark will be saved --statsdir The output directory with nextflow statistics --data_model_export_dir The output dir where json file with benchmarking data model contents will be saved --otherdir The output directory where custom results will be saved (no directory inside) Flags: --help Display this message ``` Default input parameters and Docker images to use for each step can be specified in the [config](./nextflow.config) file. 
**NOTE: In order to make your workflow compatible with the [OpenEBench VRE Nextflow Executor](https://github.com/inab/vre-process_nextflow-executor), please make sure to use the same parameter names in your workflow.** """ ; ns1:isBasedOn ; ns1:keywords "tcga, openebench, benchmarking" ; ns1:license ; ns1:name "OpenEBench TCGA Cancer Driver Genes benchmarking workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Catalog number" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Higher classification" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Image URI" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Image license" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Institution URL" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Object type" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Person identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Person name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Rights holder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-11-26T14:46:05Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:55:15Z"^^ns1:Date ; ns1:description "" ; ns1:input , , , , , , , , ; ns1:keywords "Segmentation" ; ns1:license ; ns1:name "De novo digitisation" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Tim Dudgeon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/All fragments (SDF)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Candidate compounds (SMILES)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection size for docking" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Fragment for SuCOS scoring (SDF/MOL)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Number of poses" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Receptor (PDB)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SuCOS threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Docking poses" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Scored and filtered poses" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-02-17T03:01:49Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:35Z"^^ns1:Date ; ns1:description "Virtual screening of the SARS-CoV-2 main protease with rDock and pose scoring" ; ns1:input , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "fragment-based-docking-scoring/main" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:Person ; ns1:name "Simon Bray" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Apoprotein PDB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Force field" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Ligand SDF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Water model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Complex GRO" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Complex topology" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Position restraints file" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-12-21T03:01:09Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:32Z"^^ns1:Date ; ns1:description """# Protein-ligand complex parameterization Parameterizes an input protein (PDB) and ligand (SDF) file prior to molecular dynamics simulation with GROMACS. This is a simple workflow intended for use as a subworkflow in more complex MD workflows. It is used as a subworkflow by the GROMACS MMGBSA and dcTMD workflows. """ ; ns1:input , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "protein-ligand-complex-parameterization/main" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Simon Bray" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Apoprotein PDB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Force field" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Ligand SDF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NPT equilibration steps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NVT equilibration steps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Number of simulations" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Production steps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Salt concentration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Water model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Complex GRO" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Complex topology" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GRO files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GROMACS setup (ITP) on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MMGBSA free energy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MMGBSA statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/XTC files" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-10T03:02:02Z"^^ns1:Date ; ns1:dateModified "2026-03-10T03:02:02Z"^^ns1:Date ; ns1:description "MMGBSA simulation and calculation" ; ns1:input , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "gromacs-mmgbsa/main" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:Person ; ns1:name "Simon Bray" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Force field" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ligand SDF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of equilibration steps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of simulations" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of steps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Protein PDB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Protein pull group" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pull group pbcatom" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pulling rate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Salt concentration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Step length (ps)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Temperature" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Water model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pH to protonate ligand" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Free energy data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Friction data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GRO files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS setup (ITP) on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Structure file (GRO format, optional)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "TPR files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Topology" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "XTC files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "XVG files" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-02-20T03:02:00Z"^^ns1:Date ; ns1:dateModified "2026-02-20T03:02:00Z"^^ns1:Date ; ns1:description "Perform dcTMD free energy simulations and calculations" ; ns1:input , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "gromacs-dctmd/main" ; ns1:output , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-29T10:08:17Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:42Z"^^ns1:Date ; ns1:description "Generate possible metabolic routes for the production of a target molecule in an organism of choice" ; ns1:input , , ; ns1:keywords "Retrosynthesis, pathway prediction, pathway design, Synthetic Biology, metabolic engineering" ; ns1:license ; ns1:name "RetroSynthesis" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEM SBML" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximal Pathway Length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Target InChI" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Top Ranking Pathways" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . 
a ns1:Person ; ns1:name "Mar Batlle" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "LFC" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Molti_Louvain" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Molti_modularity" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "approach" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "control_id" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "counts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "layers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_nodes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiXrank_r" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiXrank_selfloops" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "padj" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2021-12-14T10:01:12Z"^^ns1:Date ; ns1:dateModified "2023-04-21T11:42:40Z"^^ns1:Date ; ns1:description """
drawing
MultiAffinity enables the study of how gene dysregulation propagates on a multilayer network on a disease of interest, uncovering key genes. Find the detailed documentation for the tool [here](https://marbatlle.github.io/multiAffinity/). ![alt](https://github.com/marbatlle/multiAffinity/raw/main/docs/img/multiAffinity_workflow.png)""" ; ns1:image ; ns1:input , , , , , , , , , , , , ; ns1:keywords "cancer, pediatric, rna-seq, networks, community-detection" ; ns1:license ; ns1:name "multiAffinity" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-12-20T09:43:13Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:56:22Z"^^ns1:Date ; ns1:description "This workflow extracts 5 different time periods e.g. January- June 2019, 2020 and 2021, July-December 2019 and 2020 over a single selected location. Then statistics (mean, minimum, maximum) are computed. The final products are maximum, minimum and mean." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.251.1" ; ns1:image ; ns1:keywords "RELIANCE, copernicus, air-quality" ; ns1:license ; ns1:name "Investigation of lockdown effect on air quality between January 2019 to May 2021." ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "10_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "3_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "4_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5_Input Parameter" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "6_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "7_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "8_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "9_Input Parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CAMS-PM2_5-20211222_netcdf" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-12-29T07:57:46Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:56:30Z"^^ns1:Date ; ns1:description """Abstract CWL Automatically generated from the Galaxy workflow file: GTN 'Pangeo 101 for everyone - Introduction to Xarray'. In this tutorial, we analyze particle matter < 2.5 μm/m3 data from Copernicus Atmosphere Monitoring Service to understand Xarray Galaxy Tools: - Understand how an Xarray dataset is organized; - Get metadata from Xarray dataset such as variable names, units, coordinates (latitude, longitude, level), etc; - Plot an Xarray dataset on a geographical map and learn to customize it; - Select/Subset an Xarray dataset from coordinates values such as time selection or a subset over a geographical area; - Mask an Xarray dataset with a Where statement, for instance to only see PM2.5 > 30 μm/m and highlight on a map regions with "high" values; - Convert an Xarray dataset to Tabular data (pandas dataframe); - Plot tabular data to visualize the forecast PM2.5 over a single point (here Naples) using a scatterplot and/or climate stripes.""" ; ns1:image ; ns1:input , , , , , , , , , , ; ns1:keywords "GTN, Climate, copernicus, pangeo" ; ns1:license ; ns1:name "Pangeo 101 for everyone - introduction to Xarray" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "basecalling model" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "configuration_command" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nanopore reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "identifier used" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kraken_database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye de novo assembler for single-molecule reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Guppy for CPU" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Krona taxonomy visualization" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Medaka polisher" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FASTQ files merged" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MinION-Quality-Check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "QUAlity assessment" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-01-06T07:36:38Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:56:37Z"^^ns1:Date ; ns1:description """### - deprecated - Workflow for sequencing with ONT Nanopore, from basecalling to assembly. - Guppy (basecalling of raw reads) - MinIONQC (quality check) - FASTQ merging from multi into one file - Kraken2 (taxonomic classification) - Krona (classification visualization) - Flye (de novo assembly) - Medaka (assembly polishing) - QUAST (assembly quality reports) **All tool CWL files and other workflows can be found here:**
Tools: https://git.wur.nl/unlock/cwl/-/tree/master/cwl
Workflows: https://git.wur.nl/unlock/cwl/-/tree/master/cwl/workflows
""" ; ns1:image ; ns1:input , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Nanopore Guppy Basecalling Assembly Workflow" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Basecalling model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run binning workflow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Deduplicate reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Contamination reference file(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier used" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "illumina forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "illumina reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum memory in MB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "When working with metagenomes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nanopore reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pilon fix list" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use mapped reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Binning output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read quality and filtering reports" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-04-20T09:12:09Z"^^ns1:Date ; ns1:dateModified "2023-02-02T15:16:21Z"^^ns1:Date ; ns1:description """#### - Deprecated - #### See our updated hybrid assembly workflow: https://workflowhub.eu/workflows/367 #### And other workflows: https://workflowhub.eu/projects/16#workflows # **Workflow for sequencing with ONT Nanopore data, from basecalled reads to (meta)assembly and binning** - Workflow Nanopore Quality - Kraken2 taxonomic classification of FASTQ reads - Flye (de-novo assembly) - Medaka (assembly polishing) - metaQUAST (assembly quality reports) **When Illumina reads are provided:** - Workflow Illumina Quality: https://workflowhub.eu/workflows/336?version=1 - Assembly polishing with Pilon
- Workflow binnning https://workflowhub.eu/workflows/64?version=11 - Metabat2 - CheckM - BUSCO - GTDB-Tk **All tool CWL files and other workflows can be found here:**
Tools: https://git.wur.nl/unlock/cwl/-/tree/master/cwl
Workflows: https://git.wur.nl/unlock/cwl/-/tree/master/cwl/workflows
""" ; ns1:image ; ns1:input , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "nanopore, Genomics, Metagenomics" ; ns1:license ; ns1:name "Nanopore Assembly Workflow - Deprecated -" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path_gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path_itp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path_top" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T12:33:33Z"^^ns1:Date ; ns1:dateModified "2026-03-24T12:36:28Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. 
With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.255.3" ; ns1:image ; ns1:input , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL GMX Automatic Ligand Parameterization tutorial" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_fpocket_select_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_fpocket_select_input_pockets_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_fpocket_select_output_pocket_pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_fpocket_select_output_pocket_pqr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step2_box_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step2_box_output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step3_babel_convert_prep_lig_config" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step3_babel_convert_prep_lig_input_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step3_babel_convert_prep_lig_output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step4_str_check_add_hydrogens_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step4_str_check_add_hydrogens_input_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step4_str_check_add_hydrogens_output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step5_autodock_vina_run_output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step5_autodock_vina_run_output_pdbqt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step6_babel_convert_pose_pdb_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step6_babel_convert_pose_pdb_output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pocket_pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pocket_pqr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdbqt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-01-10T11:48:32Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:56:53Z"^^ns1:Date ; ns1:description "This workflow performs the process of protein-ligand docking, step by step, using the BioExcel Building Blocks library (biobb)." 
; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.257.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Protein-ligand docking (fpocket)" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_itp_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_str_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_str_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_ndx_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_molecule_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_str_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_ndx_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T14:21:31Z"^^ns1:Date ; ns1:dateModified "2026-03-24T14:30:19Z"^^ns1:Date ; ns1:description """# Protein Ligand Complex MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/complex/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation system** containing a **protein in complex with a ligand**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **T4 lysozyme** L99A/M102Q protein (PDB code 3HTB), in complex with the **2-propylphenol** small molecule (3-letter Code JZ4). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.258.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein Ligand Complex MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pocket_pdb" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pocket_pqr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdbqt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T15:24:38Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:28:35Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.259.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein-ligand Docking tutorial (Fpocket)" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-05-29T11:19:05Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:47Z"^^ns1:Date ; ns1:description "The workflow runs the RetroSynthesis algorithm to generate a collection of heterologous pathways in a host organism of choice, converts them to SBML files, performs analysis on the pathways to then rank the theoretical best performing ones." ; ns1:input , , , ; ns1:keywords "pathway prediction, pathway design, metabolic engineering, Synthetic Biology, Retrosynthesis" ; ns1:license ; ns1:name "Pathway Ranker" ; ns1:output , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Alexandre Almeida" . a ns1:Person ; ns1:name "Guillermo Rangel-Pineros and Ekaterina Sakharova" . 
a ns1:Person ; ns1:name "Martin Hölzer" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "add_hmms_tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hmmscan_database_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "img_blast_database_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_fasta_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mashmap_reference_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ncbi_tax_db_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pprmeta_simg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "virsorter_data_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "virsorter_virome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blast_merged_tsvs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blast_result_filtereds" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blast_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "high_confidence_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "high_confidence_faa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "krona_plot_all" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "krona_plots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "low_confidence_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "low_confidence_faa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mashmap_hits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "parse_prophages_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "prophages_faa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "taxonomy_assignations" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "virfinder_output" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "virsorter_output_fastas" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_molecule_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T10:02:33Z"^^ns1:Date ; ns1:dateModified "2026-03-26T10:13:47Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.260.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Amber Protein MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_molecule_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:16:30Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:22:37Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.261.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Amber Protein Ligand Complex MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . 
a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rst_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_mdinfo_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:49:12Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:55:27Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.262.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL ABC MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-01-17T10:44:28Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:57:28Z"^^ns1:Date ; ns1:description """# polya_liftover - sc/snRNAseq Snakemake Workflow A [Snakemake][sm] workflow for using PolyA_DB and UCSC Liftover with Cellranger. Some genes are not accurately annotated in the reference genome. Here, we use information provide by the [PolyA_DB v3.2][polya] to update the coordinates, then the [USCS Liftover][liftover] tool to update to a more recent genome. Next, we use [Cellranger][cr] to create the reference and count matrix. Finally, by taking advantage of the integrated [Conda][conda] and [Singularity][sing] support, we can run the whole thing in an isolated environment. Please see our [README][readme] for the full details! 
[sm]: https://snakemake.readthedocs.io/en/stable/index.html "Snakemake" [polya]: https://exon.apps.wistar.org/polya_db/v3/index.php "PolyA_DB" [liftover]: https://genome.ucsc.edu/cgi-bin/hgLiftOver "Liftover" [cr]: https://github.com/alexdobin/STAR "Cellranger" [conda]: https://docs.conda.io/en/latest/ "Conda" [sing]: https://sylabs.io/singularity/ "Singularity" [readme]: https://github.com/IMS-Bio2Core-Facility/polya_liftover/blob/main/README.md""" ; ns1:image ; ns1:keywords "Transcriptomics, scRNA-seq, Snakemake, FAIR workflows, FastQC, MultiQC, Cellranger, LiftOver" ; ns1:license ; ns1:name "polya_liftover" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-01-27T10:44:25Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:57:29Z"^^ns1:Date ; ns1:description """## Introduction **vibbits/rnaseq-editing** is a bioinformatics pipeline that can be used to analyse RNA sequencing data obtained from organisms with a reference genome and annotation followed by a prediction step of editing sites using RDDpred. The pipeline is largely based on the [nf-core RNAseq pipeline](https://nf-co.re/rnaseq/). The initial nf-core pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. 
Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! ## Pipeline summary 1. Merge re-sequenced FastQ files ([`cat`](http://www.linfo.org/cat.html)) 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 3. Adapter and quality trimming ([`Trimmomatics`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) 4. Use of STAR for multiple alignment and quantification: [`STAR`](https://github.com/alexdobin/STAR) 5. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) 6. Prediction of editing sites using RDDpred ([`RDDpred`](https://github.com/vibbits/RDDpred)) 7. Extensive quality control: 1. [`RSeQC`](http://rseqc.sourceforge.net/) 2. [`Qualimap`](http://qualimap.bioinfo.cipf.es/) 3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html) 8. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) ## Quick Start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.04.0`) 2. Install [`Docker`](https://docs.docker.com/engine/installation/) on a Linux operating system. Note: This pipeline does not currently support running with macOS. 3. 
Download the pipeline via git clone, download the associated training data files for RDDpred into the assets folder, download the reference data to ```console git clone https://github.com/vibbits/rnaseq-editing.git cd $(pwd)/rnaseq-editing/assets # download training data file for RDDpred wget -c # download reference data for your genome, we provide genome and indexed genome for STAR 2.7.3a ``` > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. 4. Start running your own analysis using Docker locally! ```console nextflow run vibbits/rnaseq-editing \\ --input samplesheet.csv \\ --genome hg19 \\ -profile docker ``` * An executable Python script called [`fastq_dir_to_samplesheet.py`](https://github.com/nf-core/rnaseq/blob/master/bin/fastq_dir_to_samplesheet.py) has been provided if you would like to auto-create an input samplesheet based on a directory containing FastQ files **before** you run the pipeline (requires Python 3 installed locally) e.g. ```console wget -L https://raw.githubusercontent.com/nf-core/rnaseq/master/bin/fastq_dir_to_samplesheet.py ./fastq_dir_to_samplesheet.py samplesheet.csv --strandedness reverse ``` * The final analysis has been executed on the Azure platform using Azure Kubernetes Services (AKS). AKS has to be set up on the Azure platform by defining a standard node pool called sys next to the scalable node pool cpumem using Standard_E8ds_v4 as node size for calculation. Furthermore, persistent volume claims (PVCs) have been setup for input and work folders of the nextflow runs. 
In the PVC `input` the reference data as well as the fastqc files have been stored, whereas in the PVC `work` the temporary nextflow files for the individual runs as well as the output files have been stored. * The config file for the final execution run for [RNAseq editing for the human samples and reference genome hg19](https://github.com/vibbits/rnaseq-editing/blob/master/nextflow.config.as-executed). ## Documentation The nf-core/rnaseq pipeline comes with documentation about the pipeline [usage](https://nf-co.re/rnaseq/usage), [parameters](https://nf-co.re/rnaseq/parameters) and [output](https://nf-co.re/rnaseq/output). ## Credits These scripts were written to provide a reproducible data analysis pipeline until the downstream processing using dedicated R scripts for exploratory analysis and plotting. The general structure of the pipeline is based on the data analysis steps of our recent paper [ADAR1 interaction with Z-RNA promotes editing of endogenous double-stranded RNA and prevents MDA5-dependent immune activation](https://pubmed.ncbi.nlm.nih.gov/34380029/). Note: The nf-core scripts this pipeline is based on were originally written for use at the [National Genomics Infrastructure](https://ngisweden.scilifelab.se), part of [SciLifeLab](http://www.scilifelab.se/) in Stockholm, Sweden, by Phil Ewels ([@ewels](https://github.com/ewels)) and Rickard Hammarén ([@Hammarn](https://github.com/Hammarn)). The RNAseq pipeline was re-written in Nextflow DSL2 by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London. 
## Citations The `nf-core` publication is cited here as follows: > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:keywords "" ; ns1:license ; ns1:name "RNA sequencing data obtained from organisms with a reference genome and annotation followed by a prediction step of editing sites using RDDpred" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-01-27T16:12:12Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:34Z"^^ns1:Date ; ns1:description """# Snakemake workflow: FAIR CRCC - send data [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/crs4/fair-crcc-send-data/workflows/Tests/badge.svg?branch=main)](https://github.com/crs4/fair-crcc-send-data/actions?query=branch%3Amain+workflow%3ATests) A Snakemake workflow for securely sharing Crypt4GH-encrypted sensitive data from the [CRC Cohort](https://www.bbmri-eric.eu/scientific-collaboration/colorectal-cancer-cohort/) to a destination approved through a successful [access request](https://www.bbmri-eric.eu/services/access-policies/). The recommendation is to create a directory for the request that has been approved; it will be used as the working directory for the run. Copy there the recipient's crypt4gh key and prepare the run configuration. The configuration will specify the repository, the destination of the data, and the list of files/directories to transfer. ## What's the CRC Cohort? 
The CRC Cohort is a collection of clinical data and digital high-resolution digital pathology images pertaining to tumor cases. The collection has been assembled from a number of participating biobanks and other partners through the [ADOPT BBMRI-ERIC](https://www.bbmri-eric.eu/scientific-collaboration/adopt-bbmri-eric/) project. Researchers interested in using the data for science can file an application for access. If approved, the part of the dataset required for the planned and approved work can be copied to the requester's selected secure storage location (using this workflow). ## Usage ### Example mkdir request_1234 && cd request_1234 # Now write the configuration, specifying crypt4gh keys, destination and files to send. # Finally, execute workflow. snakemake --snakefile ../fair-crcc-send-data/workflow/Snakefile --profile ../profile/ --configfile config.yml --use-singularity --cores #### Run configuration example ``` recipient_key: ./recipient_key repository: path: "/mnt/rbd/data/sftp/fair-crcc/" private_key: bbmri-key public_key: bbmri-key.pub sources: glob_extension: ".tiff.c4gh" items: - some/directory/to/glob - another/individual/file.tiff.c4gh destination: type: "S3" root_path: "my-bucket/prefix/" connection: # all elements will be passed to the selected snakemake remote provider access_key_id: "MYACCESSKEY" secret_access_key: "MYSECRET" host: http://localhost:9000 verify: false # don't verify ssl certificates ``` TODO The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=crs4%2Ffair-crcc-send-data). If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) fair-crcc-send-datasitory and its DOI (see above). """ ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "FAIR CRCC - send data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-01-31T11:03:37Z"^^ns1:Date ; ns1:dateModified "2025-03-14T02:02:34Z"^^ns1:Date ; ns1:description """# Snakemake workflow: FAIR CRCC - image conversion [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/crs4/fair-crcc-img-convert/workflows/Tests/badge.svg?branch=main)](https://github.com/crs4/fair-crcc-img-convert/actions?query=branch%3Amain+workflow%3ATests) A Snakemake workflow for converting whole-slide images (WSI) from the [CRC Cohort](https://www.bbmri-eric.eu/scientific-collaboration/colorectal-cancer-cohort/) from vendor-specific image formats to open image formats (at the moment, OME-TIFF). The workflow also encrypts the new image files with [Crypt4GH](https://doi.org/10.1093/bioinformatics/btab087). ## What's the CRC Cohort? The CRC Cohort is a collection of clinical data and digital high-resolution digital pathology images pertaining to tumor cases. The collection has been assembled from a number of participating biobanks and other partners through the [ADOPT BBMRI-ERIC](https://www.bbmri-eric.eu/scientific-collaboration/adopt-bbmri-eric/) project. Researchers interested in using the data for science can [apply for access](https://www.bbmri-eric.eu/services/access-policies/). ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=crs4%2Ffair-crcc-img-convert). If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository and its DOI (see above). """ ; ns1:keywords "" ; ns1:license ; ns1:name "FAIR CRCC - image conversion" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-02-08T11:36:16Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:57:29Z"^^ns1:Date ; ns1:description """ # Summary This notebook demonstrates how to recreate lineages published in the paper [Live imaging of remyelination in the adult mouse corpus callosum](https://www.pnas.org/content/118/28/e2025795118) and available at [idr0113-bottes-opcclones](https://idr.openmicroscopy.org/search/?query=Name:idr0113). The lineage is created from the metadata associated to the specified image. To load the data from the Image Data Resource, we use: * the [Python API](https://docs.openmicroscopy.org/omero/latest/developers/Python.html) * the [JSON API](https://docs.openmicroscopy.org/omero/latest/developers/json-api.html) LPC-induced focal demyelination and in vivo imaging of genetically targeted OPCs and their progeny to describe the cellular dynamics of OPC-mediated remyelination in the CC. Longitudinal observation of OPCs and their progeny for up to two months reveals functional inter- and intraclonal heterogeneity and provides insights into the cell division capacity and the migration/differentiation dynamics of OPCs and their daughter cells in vivo. The majority of the clones remained quiescent or divided only few times. Some OPCs were highly proliferative. Large clones showed longer times between consecutive divisions compared to low proliferating clones. OPCs show distinct modes of cell division: from symmetric proliferative, to symmetric differentiating and also asymmetric cell division, where the OPC is self-renewed while the other daughter cell differentiates. Only 16.46% of OPC-derived cells differentiated into mature, remyelinating oligodendrocytes, with OPCs born at early divisions showing a higher probability to survive and to terminally differentiate. 
Cell death was associated with distinct cell division histories of different clones, with higher probability of death when generated at later divisions. Migratory behaviour was restricted to progenitors. Successfully differentiating progenitors moved shorter distances per day compared to dying cells. # Inputs Parameters needed to configure the workflow: **imageId**: Identifier of an image in IDR. # Outputs Output file generated: **lineage_imageId.pdf**: A PDF with the generated lineage. Options to save as `png` or `svg` are also available. """ ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Cell Lineage in the adult mouse corpus callosum" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2020-06-08T10:21:08Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:53Z"^^ns1:Date ; ns1:description """

VIRify

Sankey plot

VIRify is a recently developed pipeline for the detection, annotation, and taxonomic classification of viral contigs in metagenomic and metatranscriptomic assemblies. The pipeline is part of the repertoire of analysis services offered by MGnify. VIRify’s taxonomic classification relies on the detection of taxon-specific profile hidden Markov models (HMMs), built upon a set of 22,014 orthologous protein domains and referred to as ViPhOGs.

VIRify was implemented in CWL.

What do I need?

The current implementation uses CWL version 1.2 dev+2. It was tested using Toil version 4.10 as the workflow engine and conda to manage the software dependencies.

Docker - Singularity support

Soon…

Setup environment

conda env create -f cwl/requirements/conda_env.yml
conda activate viral_pipeline

Basic execution

cd cwl/
virify.sh -h

A note about metatranscriptomes

Although VIRify has been benchmarked and validated with metagenomic data in mind, it is also possible to use this tool to detect RNA viruses in metatranscriptome assemblies (e.g. SARS-CoV-2). However, some additional considerations for this purpose are outlined below:
1. Quality control: As for metagenomic data, a thorough quality control of the FASTQ sequence reads to remove low-quality bases, adapters and host contamination (if appropriate) is required prior to assembly. This is especially important for metatranscriptomes as small errors can further decrease the quality and contiguity of the assembly obtained. We have used TrimGalore for this purpose.

2. Assembly: There are many assemblers available that are appropriate for either metagenomic or single-species transcriptomic data. However, to our knowledge, there is no assembler currently available specifically for metatranscriptomic data. From our preliminary investigations, we have found that transcriptome-specific assemblers (e.g. rnaSPAdes) generate more contiguous and complete metatranscriptome assemblies compared to metagenomic alternatives (e.g. MEGAHIT and metaSPAdes).

3. Post-processing: Metatranscriptomes generate highly fragmented assemblies. Therefore, filtering contigs based on a set minimum length has a substantial impact on the number of contigs processed in VIRify. It has also been observed that the number of false-positive detections of VirFinder (one of the tools included in VIRify) is lower among larger contigs. The choice of a length threshold will depend on the complexity of the sample and the sequencing technology used, but in our experience any contigs <2 kb should be analysed with caution.

4. Classification: The classification module of VIRify depends on the presence of a minimum number and proportion of phylogenetically-informative genes within each contig in order to confidently assign a taxonomic lineage. Therefore, short contigs typically obtained from metatranscriptome assemblies remain generally unclassified. For targeted classification of RNA viruses (for instance, to search for Coronavirus-related sequences), alternative DNA- or protein-based classification methods can be used. Two of the possible options are: (i) using MashMap to screen the VIRify contigs against a database of RNA viruses (e.g. Coronaviridae) or (ii) using hmmsearch to screen the proteins obtained in the VIRify contigs against marker genes of the taxon of interest.

Contact us

MGnify helpdesk""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.26.1" ; ns1:image ; ns1:input , , , , , , , , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "VIRify" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Alexandre Almeida" . a ns1:Person ; ns1:name "Guillermo Rangel-Pineros and Ekaterina Sakharova" . a ns1:Person ; ns1:name "Martin Hölzer" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "S2B_MSIL2A_20200626T095029_N0214_R079_T34VFN_20200626T123234_tar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sentinel2_tiles_world" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "test_parcels_32635" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-03-11T12:32:55Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:57:33Z"^^ns1:Date ; ns1:description """This workflow demonstrates the usage of EODIE, a toolkit to extract object based timeseries information from Earth Observation data. EODIE is a toolkit to extract object based timeseries information from Earth Observation data. The EODIE code can be found on [Gitlab](https://gitlab.com/fgi_nls/public/EODIE) . The goal of EODIE is to ease the extraction of time series information at object level. Today, vast amounts of Earth Observation data are available to the users via for example earth explorer or scihub. Often, not the whole images are needed for exploitation, but only the timeseries of a certain feature on object level. Objects may be polygons depicting agricultural field parcels, forest plots, or areas of a certain land cover type. 
EODIE takes the objects in as polygons in a shapefile as well as the timeframe of interest and the features (eg vegetation indices) to be extracted. The output is a per polygon timeseries of the selected features over the timeframe of interest. **Online documentation** EODIE documentation can be found [here](https://eodie.readthedocs.io/en/latest/). **Abstract CWL** Automatically generated from the Galaxy workflow file: Workflow constructed from history 'EODIE Sentinel'""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.274.1" ; ns1:image ; ns1:input , , ; ns1:keywords "earth observation, copernicus, ndvi, sentinel-2 data" ; ns1:license ; ns1:name "Galaxy workflow demonstrating the usage of EODIE Galaxy Tool" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T10:09:38Z"^^ns1:Date ; ns1:dateModified "2026-03-24T10:12:24Z"^^ns1:Date ; ns1:description """# Protein MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.276.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myeditconf.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfix_side_chain.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_energy.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_image.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_rgyr.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_rms.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_trjconv_str.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygrompp.tpr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.cpt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.edr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.log" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.zip" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:41:05Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:43:16Z"^^ns1:Date ; ns1:description """# Protein MD Setup tutorial using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/gmx-protein-md-setup). *** **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.277.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Protein MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:Person ; ns1:name "Daniel López-López" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bits_set" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blacklist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bwt_algorithm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "codex_max_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "codex_min_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "codex_min_lratio" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "enable_codex" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "enable_exomeDepth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "enable_gridss" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "enable_manta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "exomeDepth_max_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "exomeDepth_min_bf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "exomeDepth_min_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "generate_bwa_indexes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gridss_max_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gridss_min_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gridss_min_q" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "manta_exome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "manta_max_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "manta_min_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "manta_min_q" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_mapping_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "read_group" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_amb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_ann" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_bwt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_fai" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_pac" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_sa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_bwa_mem" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_fastp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_fastqc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_gridss" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_samtools" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastqc_paired_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastqc_paired_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastqc_raw_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastqc_raw_zip" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "json_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_all" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_bam_filtering" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_codex" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_exomedepth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gridss" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_manta" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-10-11T11:04:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:57:59Z"^^ns1:Date ; ns1:description """# StructuralVariants Workflow """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "cancer, CODEX2, ExomeDepth, manta, TransBioNet, variant calling, GRIDSS, structural variants" ; ns1:license ; ns1:name "CNV_pipeline" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 11 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_ndx_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_edr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpt_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_str_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_molecule_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_xvg_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_top_zip_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_tpr_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T09:34:44Z"^^ns1:Date ; ns1:dateModified "2026-03-24T09:51:08Z"^^ns1:Date ; ns1:description """# Protein MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.279.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2020-06-08T10:29:47Z"^^ns1:Date ; ns1:dateModified "2023-03-24T16:47:02Z"^^ns1:Date ; ns1:description """

VIRify

Sankey plot

VIRify is a recently developed pipeline for the detection, annotation, and taxonomic classification of viral contigs in metagenomic and metatranscriptomic assemblies. The pipeline is part of the repertoire of analysis services offered by MGnify. VIRify’s taxonomic classification relies on the detection of taxon-specific profile hidden Markov models (HMMs), built upon a set of 22,014 orthologous protein domains and referred to as ViPhOGs.

VIRify was implemented in CWL.

What do I need?

The current implementation uses CWL version 1.2 dev+2. It was tested using Toil version 4.10 as the workflow engine and conda to manage the software dependencies.

Docker - Singularity support

Soon…

Setup environment

conda env create -f cwl/requirements/conda_env.yml
conda activate viral_pipeline

Basic execution

cd cwl/
virify.sh -h

A note about metatranscriptomes

Although VIRify has been benchmarked and validated with metagenomic data in mind, it is also possible to use this tool to detect RNA viruses in metatranscriptome assemblies (e.g. SARS-CoV-2). However, some additional considerations for this purpose are outlined below:
1. Quality control: As for metagenomic data, a thorough quality control of the FASTQ sequence reads to remove low-quality bases, adapters and host contamination (if appropriate) is required prior to assembly. This is especially important for metatranscriptomes as small errors can further decrease the quality and contiguity of the assembly obtained. We have used TrimGalore for this purpose.

2. Assembly: There are many assemblers available that are appropriate for either metagenomic or single-species transcriptomic data. However, to our knowledge, there is no assembler currently available specifically for metatranscriptomic data. From our preliminary investigations, we have found that transcriptome-specific assemblers (e.g. rnaSPAdes) generate more contiguous and complete metatranscriptome assemblies compared to metagenomic alternatives (e.g. MEGAHIT and metaSPAdes).

3. Post-processing: Metatranscriptomes generate highly fragmented assemblies. Therefore, filtering contigs based on a set minimum length has a substantial impact on the number of contigs processed in VIRify. It has also been observed that the number of false-positive detections of VirFinder (one of the tools included in VIRify) is lower among larger contigs. The choice of a length threshold will depend on the complexity of the sample and the sequencing technology used, but in our experience any contigs <2 kb should be analysed with caution.

4. Classification: The classification module of VIRify depends on the presence of a minimum number and proportion of phylogenetically-informative genes within each contig in order to confidently assign a taxonomic lineage. Therefore, short contigs typically obtained from metatranscriptome assemblies remain generally unclassified. For targeted classification of RNA viruses (for instance, to search for Coronavirus-related sequences), alternative DNA- or protein-based classification methods can be used. Two of the possible options are: (i) using MashMap to screen the VIRify contigs against a database of RNA viruses (e.g. Coronaviridae) or (ii) using hmmsearch to screen the proteins obtained in the VIRify contigs against marker genes of the taxon of interest.

Contact us

MGnify helpdesk""" ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "VIRify" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IndexName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "alignments_are_sorted" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bankfile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "barcode_tag" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "base_correction" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bonferroni" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bq2_handling" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "call_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cancer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cancerSamples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "canon" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "classic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "comment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "csvFile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "def_alt_bq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "def_alt_jq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "defqual" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "del_baq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "disable_trim_poly_g" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "duplicate_scoring_strategy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "empty_text" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "enable_source_qual" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "exclude_unmapped" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "extractFields" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filterInterval" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "force_polyg_tail_trimming" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "formatEff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "geneId" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "genome_reference" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hgvs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "html_report_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ignore_vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "illumina_1_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "importGenome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "interval" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "keepflags" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lof" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "max_depth_cov" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "max_mapping_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_alt_bq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_alt_jq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_bq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_cov" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_jq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_length_required" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min_mq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "motif" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nextProt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noGenome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noHgvs" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noLof" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noMotif" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noNextProt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noShiftHgvs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "noStats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_EffectType" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_baq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_default_filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_downstream" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_ext_base_alignment_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_idaq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_intergenic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_intron" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_mapping_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_upstream" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_utr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "oicr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "onlyProtein" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "onlyReg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "only_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "optical_duplicate_pixel_distance" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "outputFormat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pvalue_cutoff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qualified_phred_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reads_forward" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reads_reverse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_in" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "region" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "remove_duplicates" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "replace_non_match" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "separator" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sequenceOntology" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sort_order" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "spliceRegionExonSize" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "spliceRegionIntronMax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "spliceRegionIntronMin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "spliceSiteSize" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "strict" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads_lf_call" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "transcripts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "udLength" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "unqualified_phred_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "use_orphan" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "validation_stringency" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "validation_stringency_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_fastp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_markdups" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_samtoolsstats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_snpsift" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "statsFile_snpeff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stats_bam" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T12:36:41Z"^^ns1:Date ; ns1:dateModified "2026-03-24T12:38:51Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.280.6" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python GMX Automatic Ligand Parameterization tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T14:30:42Z"^^ns1:Date ; ns1:dateModified "2026-03-24T14:32:37Z"^^ns1:Date ; ns1:description """# Protein Ligand Complex MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/complex/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation system** containing a **protein in complex with a ligand**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **T4 lysozyme** L99A/M102Q protein (PDB code 3HTB), in complex with the **2-propylphenol** small molecule (3-letter Code JZ4). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.281.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein Ligand Complex MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T15:28:55Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:31:35Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.282.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein-ligand Docking tutorial (Fpocket)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T10:14:05Z"^^ns1:Date ; ns1:dateModified "2026-03-26T10:16:07Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.283.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Amber Protein MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:23:14Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:25:26Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.284.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Amber Protein Ligand Complex MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T11:55:41Z"^^ns1:Date ; ns1:dateModified "2026-03-26T11:57:47Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.285.5" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python ABC MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T14:02:53Z"^^ns1:Date ; ns1:dateModified "2026-03-26T14:04:55Z"^^ns1:Date ; ns1:description """# Structural DNA helical parameters from MD trajectory tutorial using BioExcel Building Blocks (biobb) **Based on the [NAFlex](https://mmb.irbbarcelona.org/NAFlex) server and in particular in its [Nucleic Acids Analysis section](https://mmb.irbbarcelona.org/NAFlex/help.php?id=tutorialAnalysisNA).** *** This tutorial aims to illustrate the process of **extracting structural and dynamical properties** from a **DNA MD trajectory helical parameters**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Drew Dickerson Dodecamer** sequence -CGCGAATTCGCG- (PDB code [1BNA](https://www.rcsb.org/structure/1BNA)). The trajectory used is a 500ns-long MD simulation taken from the [BigNASim](https://mmb.irbbarcelona.org/BIGNASim/) database ([NAFlex_DDD_II](https://mmb.irbbarcelona.org/BIGNASim/getStruc.php?idCode=NAFlex_DDD_II) entry). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.286.6" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Structural DNA helical parameters tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T14:53:54Z"^^ns1:Date ; ns1:dateModified "2026-03-26T14:55:41Z"^^ns1:Date ; ns1:description """# Protein MD Analysis tutorial using BioExcel Building Blocks (biobb) *** This workflow computes a set of Quality Control (QC) analyses on top of an uploaded MD trajectory. QC analyses include positional divergence (RMSd), change of shape (Radius of Gyration), identification of flexible regions (atomic/residue fluctuations), and identification of different molecular conformations (trajectory clustering). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.287.4" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein MD Analysis tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of mutations" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Collected Simulation Data" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T14:39:54Z"^^ns1:Date ; ns1:dateModified "2026-03-27T08:19:23Z"^^ns1:Date ; ns1:description """# Mutations Protein MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.289.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein MD Setup tutorial with mutations" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-06-08T14:57:04Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:41:56Z"^^ns1:Date ; ns1:description """Analysis of variation within individual COVID-19 samples using bowtie2, bwa, fastp, multiqc , picard ,samtools, snpEff Workflow, tools and data are available on https://github.com/fjrmoreews/cwl-workflow-SARS-CoV-2/tree/master/Variation This worklow was ported into CWL from a Galaxy Workflow ( https://github.com/galaxyproject/SARS-CoV-2/tree/master/genomics/4-Variation migrated to CWL). """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "variation, CWL, covid-19" ; ns1:license ; ns1:name "var-PE" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step10_energy_min_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step10_energy_min_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step11_gppnvt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step13_energy_nvt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step13_energy_nvt_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step14_gppnpt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step16_energy_npt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step16_energy_npt_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step17_gppmd_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step19_rmsfirst_config" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step19_rmsfirst_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_pdb_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_pdb_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step20_rmsexp_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step20_rmsexp_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step21_rgyr_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step22_image_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step23_dry_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step4_editconf_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step6_gppion_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step7_genion_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step8_gppmin_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Checkpoint file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Structures - Raw structure" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Resulting protein structure" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GROMACS topology file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Topologies GROMACS portable binary run" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trajectories - Raw trajectory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trajectories - Post-processed trajectory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "System Setup Observables - Potential Energy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "System Setup Observables - Pressure and density" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "System Setup Observables - Temperature" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Simulation Analysis" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Simulation Analysis" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Simulation Analysis" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-26T14:46:43Z"^^ns1:Date ; ns1:dateModified "2026-03-26T14:48:26Z"^^ns1:Date ; ns1:description """# Mutations Protein MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Lysozyme** protein (PDB code 1AKI). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.290.4" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein MD Setup tutorial with mutations" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-03-24T14:20:27Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:58:35Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2022 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2022 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.291.2" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python GMX OPLS/AA Automatic Ligand Parameterization tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-03-24T14:21:22Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:58:35Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2022 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2022 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.292.2" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Amber Automatic Ligand Parameterization tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-03-24T14:22:19Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:58:35Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2022 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2022 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.293.2" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python CNS/XPLOR MD Automatic Ligand Parameterization tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.itp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybabel_minimize.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myligand.pdb" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:44:01Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:45:13Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/gmx-ligand-parameterization). *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.294.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy GMX Automatic Ligand Parameterization tutorial" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.itp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_gmx.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myappend_ligand.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybabel_minimize.mol2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycat_pdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myeditconf.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_heteroatoms.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_molecule.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfix_side_chain.pdb" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenion.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygenrestr.itp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_energy.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_image.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_rgyr.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_rms_exp.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_trjconv_str_lig.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_trjconv_str_prot.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygrompp_ion.tpr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymake_ndx.ndx" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.cpt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.edr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xtc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymdrun.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb2gmx.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb_prot.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myreduce_add_hydrogens.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysolvate.zip" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:46:26Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:47:32Z"^^ns1:Date ; ns1:description """# Protein Ligand Complex MD Setup tutorial using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/gmx-protein-ligand-complex-md-setup). *** **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/complex/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation system** containing a **protein in complex with a ligand**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **T4 lysozyme** L99A/M102Q protein (PDB code 3HTB), in complex with the **2-propylphenol** small molecule (3-letter Code JZ4). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2022 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2022 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.295.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Protein Ligand Complex MD Setup" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myautodock_vina_run.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myautodock_vina_run.pdbqt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybabel_convert.ent" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybox.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycat_pdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_model_pdbqt.pdbqt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_molecule.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfpocket_filter.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfpocket_run.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfpocket_run.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfpocket_select.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myfpocket_select.pqr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myideal_sdf.sdf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mystr_check_add_hydrogens.pdb" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:48:26Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:49:48Z"^^ns1:Date ; ns1:description """# Protein-ligand Docking tutorials using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/protein-ligand-docking). *** This tutorials aim to illustrate the process of **protein-ligand docking**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular examples used are based on the **Mitogen-activated protein kinase 14** (p38-α) protein (PDB code [3HEC](https://www.rcsb.org/structure/3HEC)), a well-known **Protein Kinase enzyme**, in complex with the FDA-approved **Imatinib** (PDB Ligand code [STI](https://www.rcsb.org/ligand/STI), DrugBank Ligand Code [DB00619](https://go.drugbank.com/drugs/DB00619)) and **Dasatinib** (PDB Ligand code [1N1](https://www.rcsb.org/ligand/1N1), DrugBank Ligand Code [DB01254](https://go.drugbank.com/drugs/DB01254)), small **kinase inhibitors** molecules used to treat certain types of **cancer**. The tutorials will guide you through the process of identifying the **active site cavity** (pocket) without previous knowledge, and the final prediction of the **protein-ligand complex**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.296.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Protein-ligand Docking tutorial (Fpocket)" ; ns1:output , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myamber_to_pdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_image.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_rgyr.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_rms.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb4amber_run.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_mdout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_minout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cpout" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cprst" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.mdinfo" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.netcdf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.rst" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.trj" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:49:59Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:51:37Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/amber-protein-md-setup). *** **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.297.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Amber Protein MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_41" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_ac.frcmod" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_ac.inpcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_ac.lib" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myacpype_params_ac.prmtop" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myamber_to_pdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybabel_minimize.mol2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_image.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_rgyr.dat" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_rms.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_heteroatoms.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.parmtop" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.parmtop" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb4amber_run.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_mdout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_minout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myreduce_add_hydrogens.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myremove_ligand.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myremove_pdb_water.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cpout" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cprst" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.mdinfo" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.netcdf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.rst" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.trj" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:52:05Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:53:54Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/amber-protein-ligand-complex-md-setup). *** **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.298.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Amber Protein Ligand Complex MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_41" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_42" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_43" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_44" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_45" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_46" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_47" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_48" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_49" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_50" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_51" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_52" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_53" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_54" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_55" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_56" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_57" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_58" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_59" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_60" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_61" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_62" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_63" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_64" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_65" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_66" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_67" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_68" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_randomize_ions.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_randomize_ions.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.parmtop" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_add_ions.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.crd" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_gen_top.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.parmtop" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myleap_solvate.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myparmed_hmassrepartition.top" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_mdout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprocess_minout.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cpout" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.cprst" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.mdinfo" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.nc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mysander_mdrun.ncrst" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-03T13:54:07Z"^^ns1:Date ; ns1:dateModified "2023-05-03T13:55:38Z"^^ns1:Date ; ns1:description """# AMBER Protein MD Setup tutorials using BioExcel Building Blocks (biobb) *** ## This workflow must be run in **biobb.usegalaxy.es**. Please, [click here to access](https://biobb.usegalaxy.es/u/gbayarri/w/abcix-md-setup). *** **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation** system containing a **protein**, step by step, using the **BioExcel Building Blocks library (biobb)** wrapping the **Ambertools MD package**. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.299.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy ABC MD Setup tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2021-05-10T09:00:56Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:42:00Z"^^ns1:Date ; ns1:description "Common Workflow Language example that illustrate the process of setting up a simulation system containing a protein, step by step, using the [BioExcel Building Blocks](/projects/11) library (biobb). The particular example used is the Lysozyme protein (PDB code 1AKI). This workflow returns a resulting protein structure and simulated 3D trajectories." 
; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.29.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "molecular dynamics, trajectories, protein" ; ns1:license ; ns1:name "Protein MD Setup tutorial using BioExcel Building Blocks (biobb) in CWL" ; ns1:output , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "auto_kmer_choice" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "careful" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cov_cutoff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cov_state" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq1_type" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq2_type" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq_file_type" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "iontorrent" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kmers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "libraries_fwd_rev" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "libraries_metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "libraries_mono" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mode" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nanopore_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "onlyassembler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pacbio_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sanger_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sc" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trusted_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "untrusted_contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "all_log_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_graph_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_graph_unicycler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_graph_with_scaffolds_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_image_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_image_unicycler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_info_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_info_unicycler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assembly_unicycler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_contig_stats_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_contigs_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_scaffold_stats_spades" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_scaffolds_spades" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indices_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_left_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_right_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sars_cov_2_reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snps" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-03-29T16:10:54Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:09Z"^^ns1:Date ; ns1:description """# eQTL-Catalogue/qtlmap **Portable eQTL analysis and statistical fine mapping workflow used by the eQTL Catalogue** ### Introduction **eQTL-Catalogue/qtlmap** is a bioinformatics analysis pipeline used for QTL Analysis. The workflow takes phenotype count matrix (normalized and quality controlled) and genotype data as input, and finds associations between them with the help of sample metadata and phenotype metadata files (See [Input formats and preparation](docs/inputs_expl.md) for required input file details). To map QTLs, pipeline uses [QTLTools's](https://qtltools.github.io/qtltools/) PCA and RUN methods. For manipulation of files [BcfTools](https://samtools.github.io/bcftools/bcftools.html), [Tabix](http://www.htslib.org/doc/tabix.html) and custom [Rscript](https://www.rdocumentation.org/packages/utils/versions/3.5.3/topics/Rscript) scripts are used. The pipeline is built using [Nextflow](https://www.nextflow.io), a bioinformatics workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker / singularity containers making installation trivial and results highly reproducible. ### Documentation The eQTL-Catalogue/qtlmap pipeline comes with documentation about the pipeline, found in the `docs/` directory: 1. [Installation](docs/installation.md) 2. Pipeline configuration * [Local installation](docs/configuration/local.md) * [Adding your own system](docs/configuration/adding_your_own.md) 3. [Input formats and preparation](docs/inputs_expl.md) 4. [Running the pipeline](docs/usage.md) 5. 
[Troubleshooting](docs/troubleshooting.md) ### Pipeline Description Mapping QTLs is a process of finding statistically significant associations between phenotypes and genetic variants located nearby (within a specific window around phenotype, a.k.a cis window) This pipeline is designed to perform QTL mapping. It is intended to add this pipeline to the nf-core framework in the future. High level representation of the pipeline is shown below: ### Results The output directory of the workflow contains the following subdirectories: 1. PCA - genotype and gene expression PCA values used as covariates for QTL analysis. 2. sumstats - QTL summary statistics from nominal and permutation passes. 3. susie - SuSiE fine mapping credible sets. 4. susie_full - full set of susie results for all tested variants (very large files). 5. susie_merged - susie credible sets merged with summary statistics from univariate QTL analysis. Column names of the output files are explained [here](https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/tabix/Columns.md). # Contributors * Nurlan Kerimov * Kaur Alasoo * Masahiro Kanai * Ralf Tambets """ ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "eQTL-Catalogue/qtlmap" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-06-10T18:38:49Z"^^ns1:Date ; ns1:dateModified "2024-06-10T18:39:57Z"^^ns1:Date ; ns1:description """ ![Logo](https://cbg-ethz.github.io/V-pipe/img/logo.svg) [![bio.tools](https://img.shields.io/badge/bio-tools-blue.svg)](https://bio.tools/V-Pipe) [![Snakemake](https://img.shields.io/badge/snakemake-≥7.11.0-blue.svg)](https://snakemake.github.io/snakemake-workflow-catalog/?usage=cbg-ethz/V-pipe) [![Deploy Docker image](https://github.com/cbg-ethz/V-pipe/actions/workflows/deploy-docker.yaml/badge.svg)](https://github.com/cbg-ethz/V-pipe/pkgs/container/v-pipe) [![Tests](https://github.com/cbg-ethz/V-pipe/actions/workflows/run_regression_tests.yaml/badge.svg)](https://github.com/cbg-ethz/V-pipe/actions/workflows/run_regression_tests.yaml) [![Mega-Linter](https://github.com/cbg-ethz/V-pipe/actions/workflows/mega-linter.yml/badge.svg)](https://github.com/cbg-ethz/V-pipe/actions/workflows/mega-linter.yml) [![License: Apache-2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) V-pipe is a workflow designed for the analysis of next generation sequencing (NGS) data from viral pathogens. It produces a number of results in a curated format (e.g., consensus sequences, SNV calls, local/global haplotypes). V-pipe is written using the Snakemake workflow management system. ## Usage Different ways of initializing V-pipe are presented below. We strongly encourage you to deploy it [using the quick install script](#using-quick-install-script), as this is our preferred method. To configure V-pipe refer to the documentation present in [config/README.md](config/README.md). V-pipe expects the input samples to be organized in a [two-level](config/README.md#samples) directory hierarchy, and the sequencing reads must be provided in a sub-folder named `raw_data`. 
Further details can be found on the [website](https://cbg-ethz.github.io/V-pipe/usage/). Check the utils subdirectory for [mass-importers tools](utils/README.md#samples-mass-importers) that can assist you in generating this hierarchy. We provide [virus-specific base configuration files](config/README.md#virus-base-config) which contain handy defaults for, e.g., HIV and SARS-CoV-2. Set the virus in the general section of the configuration file: ```yaml general: virus_base_config: hiv ``` Also see [snakemake's documentation](https://snakemake.readthedocs.io/en/stable/executing/cli.html) to learn more about the command-line options available when executing the workflow. ### Tutorials Tutorials for your first steps with V-pipe for different scenarios are available in the [docs/](docs/README.md) subdirectory. ### Using quick install script To deploy V-pipe, use the [installation script](utils/README.md#quick-installer) with the following parameters: ```bash curl -O 'https://raw.githubusercontent.com/cbg-ethz/V-pipe/master/utils/quick_install.sh' ./quick_install.sh -w work ``` This script will download and install miniconda, checkout the V-pipe git repository (use `-b` to specify which branch/tag) and setup a work directory (specified with `-w`) with an executable script that will execute the workflow: ```bash cd work # edit config.yaml and provide samples/ directory ./vpipe --jobs 4 --printshellcmds --dry-run ``` Test data to test your installation is available with the tutorials provided in the [docs/](docs/README.md) subdirectory. ### Using Docker Note: the [docker image](https://github.com/cbg-ethz/V-pipe/pkgs/container/v-pipe) is only setup with components to run the workflow for HIV and SARS-CoV-2 virus base configurations. Using V-pipe with other viruses or configurations might require internet connectivity for additional software components. Create `config.yaml` or `vpipe.config` and then populate the `samples/` directory. 
For example, the following config file could be used: ```yaml general: virus_base_config: hiv output: snv: true local: true global: false visualization: true QA: true ``` Then execute: ```bash docker run --rm -it -v $PWD:/work ghcr.io/cbg-ethz/v-pipe:master --jobs 4 --printshellcmds --dry-run ``` ### Using Snakedeploy First install [mamba](https://github.com/conda-forge/miniforge#mambaforge), then create and activate an environment with Snakemake and Snakedeploy: ```bash mamba create -c conda-forge -c bioconda --name snakemake snakemake snakedeploy conda activate snakemake ``` Snakemake's [official workflow installer Snakedeploy](https://snakemake.github.io/snakemake-workflow-catalog/?usage=cbg-ethz/V-pipe) can now be used: ```bash snakedeploy deploy-workflow https://github.com/cbg-ethz/V-pipe --tag master . # edit config/config.yaml and provide samples/ directory snakemake --use-conda --jobs 4 --printshellcmds --dry-run ``` ## Dependencies - **[Conda](https://conda.io/docs/index.html)** Conda is a cross-platform package management system and an environment manager application. Snakemake uses mamba as a package manager. - **[Snakemake](https://snakemake.readthedocs.io/)** Snakemake is the central workflow and dependency manager of V-pipe. It determines the order in which individual tools are invoked and checks that programs do not exit unexpectedly. - **[VICUNA](https://www.broadinstitute.org/viral-genomics/vicuna)** VICUNA is a _de novo_ assembly software designed for populations with high mutation rates. It is used to build an initial reference for mapping reads with ngshmmalign aligner when a `references/cohort_consensus.fasta` file is not provided. Further details can be found in the [wiki](https://github.com/cbg-ethz/V-pipe/wiki/getting-started#input-files) pages. 
### Computational tools Other dependencies are managed by using isolated conda environments per rule, and below we list some of the computational tools integrated in V-pipe: - **[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** FastQC gives an overview of the raw sequencing data. Flowcells that have been overloaded or otherwise fail during sequencing can easily be determined with FastQC. - **[PRINSEQ](http://prinseq.sourceforge.net/)** Trimming and clipping of reads is performed by PRINSEQ. It is currently the most versatile raw read processor with many customization options. - **[ngshmmalign](https://github.com/cbg-ethz/ngshmmalign)** We perform the alignment of the curated NGS data using our custom ngshmmalign that takes structural variants into account. It produces multiple consensus sequences that include either majority bases or ambiguous bases. - **[bwa](https://github.com/lh3/bwa)** In order to detect specific cross-contaminations with other probes, the Burrows-Wheeler aligner is used. It quickly yields estimates for foreign genomic material in an experiment. Additionally, it can be used as an alternative aligner to ngshmmalign. - **[MAFFT](http://mafft.cbrc.jp/alignment/software/)** To standardise multiple samples to the same reference genome (say HXB2 for HIV-1), the multiple sequence aligner MAFFT is employed. The multiple sequence alignment helps in determining regions of low conservation and thus makes standardisation of alignments more robust. - **[Samtools and bcftools](https://www.htslib.org/)** The Swiss Army knife of alignment postprocessing and diagnostics. bcftools is also used to generate consensus sequence with indels. - **[SmallGenomeUtilities](https://github.com/cbg-ethz/smallgenomeutilities)** We perform genomic liftovers to standardised reference genomes using our in-house developed python library of utilities for rewriting alignments. 
- **[ShoRAH](https://github.com/cbg-ethz/shorah)** ShoRAH performs SNV calling and local haplotype reconstruction by using Bayesian clustering. - **[LoFreq](https://csb5.github.io/lofreq/)** LoFreq (version 2) is an SNV and indel caller for next-generation sequencing data, and can be used as an alternative engine for SNV calling. - **[SAVAGE](https://bitbucket.org/jbaaijens/savage) and [Haploclique](https://github.com/cbg-ethz/haploclique)** We use HaploClique or SAVAGE to perform global haplotype reconstruction for heterogeneous viral populations by using an overlap graph. ## Citation If you use this software in your research, please cite: Fuhrmann, L., Jablonski, K. P., Topolsky, I., Batavia, A. A., Borgsmueller, N., Icer Baykal, P., Carrara, M. ... & Beerenwinkel, N. (2023). "V-Pipe 3.0: A Sustainable Pipeline for Within-Sample Viral Genetic Diversity Estimation." _bioRxiv_, doi:[10.1101/2023.10.16.562462](https://doi.org/10.1101/2023.10.16.562462). ## Contributions - [Ivan Topolsky\\* ![orcid]](https://orcid.org/0000-0002-7561-0810), [![github]](https://github.com/dryak) - [Pelin Icer Baykal ![orcid]](https://orcid.org/0000-0002-9542-5292), [![github]](https://github.com/picerbaykal) - [Kim Philipp Jablonski ![orcid]](https://orcid.org/0000-0002-4166-4343), [![github]](https://github.com/kpj) - [Lara Fuhrmann ![orcid]](https://orcid.org/0000-0001-6405-0654), [![github]](https://github.com/LaraFuhrmann) - [Uwe Schmitt ![orcid]](https://orcid.org/0000-0002-4658-0616), [![github]](https://github.com/uweschmitt) - [Michal Okoniewski ![orcid]](https://orcid.org/0000-0003-4722-4506), [![github]](https://github.com/michalogit) - [Monica Dragan ![orcid]](https://orcid.org/0000-0002-7719-5892), [![github]](https://github.com/monicadragan) - [Susana Posada Céspedes ![orcid]](https://orcid.org/0000-0002-7459-8186), [![github]](https://github.com/sposadac) - [David Seifert ![orcid]](https://orcid.org/0000-0003-4739-5110), [![github]](https://github.com/SoapZA) - Tobias 
Marschall - [Niko Beerenwinkel\\*\\* ![orcid]](https://orcid.org/0000-0002-0573-6119) \\* software maintainer ; \\** group leader [github]: https://cbg-ethz.github.io/V-pipe/img/mark-github.svg [orcid]: https://cbg-ethz.github.io/V-pipe/img/ORCIDiD_iconvector.svg ## Contact We encourage users to use the [issue tracker](https://github.com/cbg-ethz/V-pipe/issues). For further enquiries, you can also contact the V-pipe Dev Team . """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.301.5" ; ns1:isBasedOn ; ns1:keywords "Alignment, Assembly, covid-19, Genomics, INDELs, rna, SNPs, variant_calling, workflow" ; ns1:license ; ns1:name "V-pipe (main multi-virus version)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "barcodes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "binning_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "block_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clip_max" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clusterDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureSelectionDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "features" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filterDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "findNeighborsR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "find_markersR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "k" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loadDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loess_span" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "margin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "matrix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minCells" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minFeatures" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nCountRNAmax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nCountRNAmin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nFeatureRNAmax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "nFeatureRNAmin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "neighbors_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "normalization_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "normalizeDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "num_bin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "num_features" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "percentMTmax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "percentMTmin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "projectName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runPCAR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runTSNER" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runUmapR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "running_step" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "scaleDataR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "scale_factor" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "selection_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "verbose" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clusterDataOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filterDataOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filterDataPlot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "findFeaturesOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "findFeaturesPlot" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "findMarkersOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "findNeighborsOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loadDataOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loadDataPlot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "normalizeDataOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runPCAOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runPCAPlot1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runPCAPlot2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runPCAPlot3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runTSNEOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runUMAPOutput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "runUMAPOutputPlot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "scaleDataOutput" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-04-14T13:28:34Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:13Z"^^ns1:Date ; ns1:description "" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "seurat scRNA-seq" ; ns1:output , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Collection of Pacbio Data" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-05T11:22:34Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:21Z"^^ns1:Date ; ns1:description "Create Meryl Database used for the estimation of assembly parameters and quality control with Merqury. Part of the VGP pipeline." 
; ns1:input ; ns1:keywords "vgp, Galaxy, Assembly" ; ns1:license ; ns1:name "VGP genome profile analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-06-17T06:11:59Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:42:19Z"^^ns1:Date ; ns1:description """ Author: AMBARISH KUMAR er.ambarish@gmail.com & ambari73_sit@jnu.ac.in This is a proposed standard operating procedure for genomic variant detection using GATK4. It is hoped to be effective and useful for getting SARS-CoV-2 genome variants. It uses Illumina RNASEQ reads and genome sequence. """ ; ns1:image ; ns1:input , , , ; ns1:keywords "CWL, GATK4, SNPs, INDELs" ; ns1:license ; ns1:name "Genomic variants - SNPs and INDELs detection using GATK4." ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_left_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_right_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sars_cov_2_reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snps" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-05T21:55:13Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:26Z"^^ns1:Date ; ns1:description "Performs Long Read assembly using PacBio data and Hifiasm. Part of VGP assembly pipeline. This workflow generate a phased assembly." ; ns1:input , , , ; ns1:keywords "vgp, Assembly, Galaxy" ; ns1:license ; ns1:name "VGP HiFi phased assembly with hifiasm and HiC data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-06-17T06:24:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:42:27Z"^^ns1:Date ; ns1:description """ Author: AMBARISH KUMAR er.ambarish@gmail.com; ambari73_sit@jnu.ac.in This is a proposed standard operating procedure for genomic variant detection using VARSCAN. It is hoped to be effective and useful for getting SARS-CoV-2 genome variants. It uses Illumina RNASEQ reads and genome sequence. """ ; ns1:image ; ns1:input , , , ; ns1:keywords "CWL, SNPs, INDELs, VARSCAN2" ; ns1:license ; ns1:name "Genomic variants - SNPs and INDELs detection using VARSCAN2." ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope model parameters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection - Trimmed" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-05T22:16:05Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:31Z"^^ns1:Date ; ns1:description "Purge Phased assembly of duplications and overlaps. Include purge steps for Primary and Alternate assemblies." ; ns1:image ; ns1:input , , , ; ns1:keywords "vgp, Assembly, Galaxy" ; ns1:license ; ns1:name "VGP purge assembly with purge_dups pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bionano Data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Conflict resolution files " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size - Parameter File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Purged Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio trimmed reads" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-05T22:21:34Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:34Z"^^ns1:Date ; ns1:description "Performs scaffolding using Bionano Data. Part of VGP assembly pipeline." ; ns1:input , , , , ; ns1:isPartOf ; ns1:keywords "vgp, Assembly, Galaxy" ; ns1:license ; ns1:name "VGP hybrid scaffolding with Bionano optical maps" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size - Parameter File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC Forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Restriction enzyme sequences " . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Scaffolded Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sequence graph - Optional" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-05T23:33:36Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:36Z"^^ns1:Date ; ns1:description "Performs scaffolding using HiC Data. Part of VGP assembly pipeline. The scaffolding can be performed on long read assembly contigs or on scaffolds (e.g.: Bionano scaffolds)." ; ns1:input , , , , , ; ns1:isPartOf ; ns1:keywords "vgp, Assembly, Galaxy" ; ns1:license ; ns1:name "VGP hybrid scaffolding with HiC data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-06T01:23:56Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:39Z"^^ns1:Date ; ns1:description "Performs Long Read assembly using PacBio data and Hifiasm. Part of VGP assembly pipeline. This workflow generate a phased assembly." ; ns1:input , , , ; ns1:keywords "vgp, Assembly, Galaxy" ; ns1:license ; ns1:name "VGP HiFi phased assembly with hifiasm and HiC data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Egon Willighagen" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-04-06T13:12:39Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:41Z"^^ns1:Date ; ns1:description """ # BridgeDb tutorial: Gene HGNC name to Ensembl identifier This tutorial explains how to use the BridgeDb identifier mapping service to translate HGNC names to Ensembl identifiers. This step is part of the OpenRiskNet use case to link Adverse Outcome Pathways to [WikiPathways](https://wikipathways.org/). First we need to load the Python library to allow calls to the [BridgeDb REST webservice](http://bridgedb.prod.openrisknet.org/swagger/): ```python import requests ``` Let's assume we're interested in the gene with HGNC MECP2 (FIXME: look up a gene in AOPWiki), the API call to make mappings is given below as `callUrl`. Here, the `H` indicates that the query (`MECP2`) is an HGNC symbol: ```python callUrl = 'http://bridgedb.prod.openrisknet.org/Human/xrefs/H/MECP2' ``` The default call returns all identifiers, not just for Ensembl: ```python response = requests.get(callUrl) response.text ``` 'GO:0001964\\tGeneOntology\\nuc065cav.1\\tUCSC Genome Browser\\n312750\\tOMIM\\nGO:0042551\\tGeneOntology\\nuc065car.1\\tUCSC Genome Browser\\nA0A087X1U4\\tUniprot-TrEMBL\\n4204\\tWikiGenes\\nGO:0043524\\tGeneOntology\\nILMN_1702715\\tIllumina\\n34355_at\\tAffy\\nGO:0007268\\tGeneOntology\\nMECP2\\tHGNC\\nuc065caz.1\\tUCSC Genome Browser\\nA_33_P3339036\\tAgilent\\nGO:0006576\\tGeneOntology\\nuc065cbg.1\\tUCSC Genome Browser\\nGO:0006342\\tGeneOntology\\n300496\\tOMIM\\nGO:0035176\\tGeneOntology\\nuc065cbc.1\\tUCSC Genome 
Browser\\nGO:0033555\\tGeneOntology\\nGO:0045892\\tGeneOntology\\nA_23_P114361\\tAgilent\\nGO:0045893\\tGeneOntology\\nENSG00000169057\\tEnsembl\\nGO:0090063\\tGeneOntology\\nGO:0005515\\tGeneOntology\\nGO:0002087\\tGeneOntology\\nGO:0005634\\tGeneOntology\\nGO:0007416\\tGeneOntology\\nGO:0008104\\tGeneOntology\\nGO:0042826\\tGeneOntology\\nGO:0007420\\tGeneOntology\\nGO:0035067\\tGeneOntology\\n300005\\tOMIM\\nNP_001104262\\tRefSeq\\nA0A087WVW7\\tUniprot-TrEMBL\\nNP_004983\\tRefSeq\\nGO:0046470\\tGeneOntology\\nGO:0010385\\tGeneOntology\\n11722682_at\\tAffy\\nGO:0051965\\tGeneOntology\\nNM_001316337\\tRefSeq\\nuc065caw.1\\tUCSC Genome Browser\\nA0A0D9SFX7\\tUniprot-TrEMBL\\nA0A140VKC4\\tUniprot-TrEMBL\\nGO:0003723\\tGeneOntology\\nGO:0019233\\tGeneOntology\\nGO:0001666\\tGeneOntology\\nGO:0003729\\tGeneOntology\\nGO:0021591\\tGeneOntology\\nuc065cas.1\\tUCSC Genome Browser\\nGO:0019230\\tGeneOntology\\nGO:0003682\\tGeneOntology\\nGO:0001662\\tGeneOntology\\nuc065cbh.1\\tUCSC Genome Browser\\nX99687_at\\tAffy\\nGO:0008344\\tGeneOntology\\nGO:0009791\\tGeneOntology\\nuc065cbd.1\\tUCSC Genome Browser\\nGO:0019904\\tGeneOntology\\nGO:0030182\\tGeneOntology\\nGO:0035197\\tGeneOntology\\n8175998\\tAffy\\nGO:0016358\\tGeneOntology\\nNM_004992\\tRefSeq\\nGO:0003714\\tGeneOntology\\nGO:0005739\\tGeneOntology\\nGO:0005615\\tGeneOntology\\nGO:0005737\\tGeneOntology\\nuc004fjv.3\\tUCSC Genome Browser\\n202617_s_at\\tAffy\\nGO:0050905\\tGeneOntology\\nGO:0008327\\tGeneOntology\\nD3YJ43\\tUniprot-TrEMBL\\nGO:0003677\\tGeneOntology\\nGO:0006541\\tGeneOntology\\nGO:0040029\\tGeneOntology\\nA_33_P3317211\\tAgilent\\nNP_001303266\\tRefSeq\\n11722683_a_at\\tAffy\\nGO:0008211\\tGeneOntology\\nGO:0051151\\tGeneOntology\\nNM_001110792\\tRefSeq\\nX89430_at\\tAffy\\nGO:2000820\\tGeneOntology\\nuc065cat.1\\tUCSC Genome Browser\\nGO:0003700\\tGeneOntology\\nGO:0047485\\tGeneOntology\\n4204\\tEntrez 
Gene\\nGO:0009405\\tGeneOntology\\nA0A0D9SEX1\\tUniprot-TrEMBL\\nGO:0098794\\tGeneOntology\\n3C2I\\tPDB\\nHs.200716\\tUniGene\\nGO:0000792\\tGeneOntology\\nuc065cax.1\\tUCSC Genome Browser\\n300055\\tOMIM\\n5BT2\\tPDB\\nGO:0006020\\tGeneOntology\\nGO:0031175\\tGeneOntology\\nuc065cbe.1\\tUCSC Genome Browser\\nGO:0008284\\tGeneOntology\\nuc065cba.1\\tUCSC Genome Browser\\nGO:0060291\\tGeneOntology\\n202618_s_at\\tAffy\\nGO:0016573\\tGeneOntology\\n17115453\\tAffy\\nA0A1B0GTV0\\tUniprot-TrEMBL\\nuc065cbi.1\\tUCSC Genome Browser\\nGO:0048167\\tGeneOntology\\nGO:0007616\\tGeneOntology\\nGO:0016571\\tGeneOntology\\nuc004fjw.3\\tUCSC Genome Browser\\nGO:0007613\\tGeneOntology\\nGO:0007612\\tGeneOntology\\nGO:0021549\\tGeneOntology\\n11722684_a_at\\tAffy\\nGO:0001078\\tGeneOntology\\nX94628_rna1_s_at\\tAffy\\nGO:0007585\\tGeneOntology\\nGO:0010468\\tGeneOntology\\nGO:0031061\\tGeneOntology\\nA_24_P237486\\tAgilent\\nGO:0050884\\tGeneOntology\\nGO:0000930\\tGeneOntology\\nGO:0005829\\tGeneOntology\\nuc065cau.1\\tUCSC Genome Browser\\nH7BY72\\tUniprot-TrEMBL\\n202616_s_at\\tAffy\\nGO:0006355\\tGeneOntology\\nuc065cay.1\\tUCSC Genome Browser\\nGO:0010971\\tGeneOntology\\n300673\\tOMIM\\nGO:0008542\\tGeneOntology\\nGO:0060079\\tGeneOntology\\nuc065cbf.1\\tUCSC Genome Browser\\nGO:0006122\\tGeneOntology\\nuc065cbb.1\\tUCSC Genome Browser\\nGO:0007052\\tGeneOntology\\nC9JH89\\tUniprot-TrEMBL\\nB5MCB4\\tUniprot-TrEMBL\\nGO:0032048\\tGeneOntology\\nGO:0050432\\tGeneOntology\\nGO:0001976\\tGeneOntology\\nI6LM39\\tUniprot-TrEMBL\\nGO:0005813\\tGeneOntology\\nILMN_1682091\\tIllumina\\nP51608\\tUniprot-TrEMBL\\n1QK9\\tPDB\\nGO:0006349\\tGeneOntology\\nGO:1900114\\tGeneOntology\\nGO:0000122\\tGeneOntology\\nGO:0006351\\tGeneOntology\\nGO:0008134\\tGeneOntology\\nILMN_1824898\\tIllumina\\n300260\\tOMIM\\n0006510725\\tIllumina\\n' You can also see the results are returned as a TSV file, consisting of two columns, the identifier and the matching database. 
We will want to convert this reply into a Python dictionary (with the identifier as key, as one database may have multiple identifiers): ```python lines = response.text.split("\\n") mappings = {} for line in lines: if ('\\t' in line): tuple = line.split('\\t') identifier = tuple[0] database = tuple[1] if (database == "Ensembl"): mappings[identifier] = database print(mappings) ``` {'ENSG00000169057': 'Ensembl'} Alternatively, we can restrict the return values from the BridgeDb webservice to just return Ensembl identifiers (system code `En`). For this, we add the `?dataSource=En` call parameter: ```python callUrl = 'http://bridgedb-swagger.prod.openrisknet.org/Human/xrefs/H/MECP2?dataSource=En' response = requests.get(callUrl) response.text ``` 'ENSG00000169057\\tEnsembl\\n' """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.326.3" ; ns1:isBasedOn ; ns1:keywords "Toxicology, jupyter" ; ns1:license ; ns1:name "BridgeDb tutorial: Gene HGNC name to Ensembl identifier" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:Person ; ns1:name "Christina Adler" . a ns1:Person ; ns1:name "Elena Martinez" . a ns1:Person ; ns1:name "Fang Wang" . a ns1:Person ; ns1:name "Henry Lydecker" . a ns1:Person ; ns1:name "Smitha Sukumar" . a ns1:ComputerLanguage ; ns1:name "Shell Script" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , ; ns1:dateCreated "2022-04-07T00:45:10Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:41Z"^^ns1:Date ; ns1:description """# Shotgun Metagenomics Analysis Analysis of metagenomic shotgun sequences including assembly, speciation, ARG discovery and more ## Description The input for this analysis is paired end next generation sequencing data from metagenomic samples. The workflow is designed to be modular, so that individual modules can be run depending on the nature of the metagenomics project at hand. 
More modules will be added as we develop them - this repo is a work in progress! These scripts have been written specifically for NCI Gadi HPC, which runs PBS Pro, however feel free to use and modify for another system if you are not a Gadi user. ### Part 1. Setup and QC Download the repo. You will see directories for `Fastq`, `Inputs`, `Reference` and `Logs`. You will need to copy or symlink your fastq to `Fastq`, sample configuration file (see below) to `Inputs` and the reference genome sequence of your host species (if applicable) to `Reference` for host contamination removal. #### Fastq inputs The scripts assume all fastq files are paired, gzipped, and all in the one directory named 'Fastq'. If your fastq are within a convoluted directory structure (eg per-sample directories) or you would simply like to link them from an alternate location, please use the script `setup_fastq.sh`. To use this script, pass the path name of your fastq as first argument on the command line, and run the script from the base working directory (/Shotgun-Metagenomics-Analysis) which will from here on be referred to as `workdir`. Note that this script looks for `f*q.gz` files (ie fastq.gz or fq.gz) - if yours differ in suffix, please adjust the script accordingly. ``` bash ./Scripts/setup_fastq.sh ``` #### Configuration/sample info The only required input configuration file should be named .config, where is the name of the current batch of samples you are processing, or some other meaningful name to your project; it will be used to name output files. The config file should be placed inside the $workdir/Inputs directory, and include the following columns, in this order: ``` 1. Sample ID - used to identify the sample, eg if you have 3 lanes of sequencing per sample, each of those 6 fastq files should contain this ID that is in column 1 2. Lab Sample ID - can be the same as column 1, or different if you have reason to change the IDs eg if the seq centre applies an in-house ID. 
Please make sure IDs are unique within column 1 and unique within column 2 3. Group - eg different time points or treatment groups. If no specific group structure is relevant, please set this to 1 (do not leave blank!) 4. Platform - should be Illumina; other sequencing platforms are not tested on this workflow 5. Sequencing centre name 6. Library - eg if you have 2 sequencing libraries for the same sample. Can be left blank, or assigned to 1. Blank will be assigned library ID of 1 during processing. ``` Please do not have spaces in any of the values for the config file. #### General setup All scripts will need to be edited to reflect your NCI project code at the `-P ` and `-l directive. Please run the script create_project.sh and follow the prompts to complete some of the setup for you. Note that you will need to manually edit the PBS resource requests for each PBS script; guidelines/example resources will be given at each step to help you do this. As the 'sed' commands within this script operate on .sh and .pbs files, this setup script has been intentionally named .bash (easiest solution). Remember to submit all scripts from your `workdir`. `bash ./Scripts/create_project.sh` For jobs that execute in parallel, there are 3 scripts: one to make the 'inputs' file listing the details of each parallel task, one job execution shell script that is run over each task in parallel, and one PBS launcher script. The process is to submit the make input script, check it to make sure your job details are correct, edit the resources directives depending on the number and size of your parallel tasks, then submit the PBS launcher script with `qsub`. #### QC Run fastQC over each fastq file in parallel. Adjust the resources as per your project. To run all files in parallel, set the number of NCPUS requested equal to the number of fastq files (remember that Gadi can only request <1 node or multiples of whole nodes). 
The make input script sorts the fastq files largest to smallest, so if you have a discrepancy in file size, optimal efficiency can be achieved by requesting fewer nodes than the total required to run all your fastq in parallel. FastQC does not multithread on a single file, so CPUs per parallel task is set to 1. Example walltimes on Gadi 'normal' queue: one 1.8 GB fastq = 4 minutes; one 52 GB fastq file = 69.5 minutes. Make the fastqc parallel inputs file by running (from `workdir`): `bash ./Scripts/fastqc_make_inputs.sh` Edit the resource requests in `fastqc_run_parallel.pbs` according to your number of fastq files and their size, then submit: `qsub fastqc_run_parallel.pbs` To ease manual inspection of the fastQC output, running `multiqc` is recommended. This will collate the individual fastQC reports into one report. This can be done on the login node for small sample numbers, or using the below script for larger cohorts. Edit the PBS directives, then run: `qsub multiqc.pbs` Save a copy of ./MultiQC/multiqc_report.html to your local disk then open in a web browser to inspect the results. #### Quality filtering and trimming Will be added at a later date. This is highly dependent on the quality of your data and your individual project needs so will be a guide only. ### Part 2. Removal of host contamination. If you have metagenomic data extracted from a host, you will need a copy of the host reference genome sequence in order to remove any DNA sequences belonging to the host. Even if your wetlab protocol included a host removal step, it is still important to run bioinformatic host removal. #### Prepare the reference Ensure you have a copy of the reference genome (or symlink) in ./Fasta. This workflow requires BBtools (tested with version 37.98). As of writing, BBtools is not available as a global app on Gadi. Please install locally and make "module loadable", or else edit the scripts to point directly to your local BBtools installation. 
BBtools repeat masking will use all available threads on machine and 85% of available mem by default. For a mammalian genome, 2 hours on one Gadi 'normal' node is sufficient for repeat masking. Update the name of your reference fastq in the `bbmap_prep.pbs` script (and BBtools, see note above), then run: `qsub ./Scripts/bbmap_prep.pbs` #### Host contamination removal TBC 1/4/22... """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.327.1" ; ns1:isPartOf ; ns1:keywords "Metagenomics, shotgun, antimicrobial resistance, humann2, bbmap, whole genome sequencing, Assembly, prokka, abricate, DIAMOND, kraken, braken" ; ns1:license ; ns1:name "Shotgun-Metagenomics-Analysis" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T15:09:19Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:10:58Z"^^ns1:Date ; ns1:description """# Mutation Free Energy Calculations using BioExcel Building Blocks (biobb) *** **Based on the official [pmx tutorial](http://pmx.mpibpc.mpg.de/sardinia2018_tutorial1/index.html).** *** This tutorial aims to illustrate how to compute a **fast-growth mutation free energy** calculation, step by step, using the BioExcel **Building Blocks library (biobb)**. The particular example used is the **Staphylococcal nuclease** protein (PDB code 1STN), a small, minimal protein, appropriate for a short tutorial. The **non-equilibrium free energy calculation** protocol performs a **fast alchemical transition** in the direction **WT->Mut** and back **Mut->WT**. The two equilibrium trajectories needed for the tutorial, one for **Wild Type (WT)** and another for the **Mutated (Mut)** protein (Isoleucine 10 to Alanine -I10A-), have already been generated and are included in this example. We will name **WT as stateA** and **Mut as stateB**. 
![](https://raw.githubusercontent.com/bioexcel/biobb_wf_pmx_tutorial/master/biobb_wf_pmx_tutorial/notebooks/schema.png) The tutorial calculates the **free energy difference** in the folded state of a protein. Starting from **two 1ns-length independent equilibrium simulations** (WT and mutant), snapshots are selected to start **fast (50ps) transitions** driving the system in the **forward** (WT to mutant) and **reverse** (mutant to WT) directions, and the **work values** required to perform these transitions are collected. With these values, **Crooks Gaussian Intersection** (CGI), **Bennett Acceptance Ratio** (BAR) and **Jarzynski estimator** methods are used to calculate the **free energy difference** between the two states. *Please note that for the sake of disk space this tutorial is using 1ns-length equilibrium trajectories, whereas in the [original example](http://pmx.mpibpc.mpg.de/sardinia2018_tutorial1/eq.mdp) the equilibrium trajectories used were obtained from 10ns-length simulations.* *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. 
![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.328.6" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Mutation Free Energy Calculations using BioExcel Building Blocks (biobb)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_left_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_right_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sampleName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sars_cov_2_reference_2bit_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sars_cov_2_reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snps" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2022-04-20T16:43:51Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:42Z"^^ns1:Date ; ns1:description "For integrative analysis of CAKUT multi-omics data DIABLO method of the mixOmics package (version 6.10.9. Singh et. al. 2019) was used with sPLS-DA (sparse Partial Least Squares Discriminant Analysis Discriminant Analysis) and PLS-DA classification." ; ns1:keywords "rare diseases" ; ns1:license ; ns1:name "EJP-RD WP13 case-study: CAKUT peptidome and miRNome data analysis using the DIABLO and PLS-DA methods from the mixOmics R package" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2022-04-20T16:59:50Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:42Z"^^ns1:Date ; ns1:description """In this analysis, we created an extended pathway, using the WikiPathways repository (Version 20210110) and the three -omics datasets. For this, each of the three -omics datasets was first analyzed to identify differentially expressed elements, and pathways associated with the significant miRNA-protein links were detected. A miRNA-protein link is deemed significant, and may possibly be implying causality, if both a miRNA and its target are significantly differentially expressed. The peptidome and the proteome datasets were quantile normalized and log2 transformed (Pan and Zhang 2018; Zhao, Wong, and Goh 2020). Before transformation, peptide IDs were mapped to protein IDs, using the information provided by the data uploaders, and were summarized into single protein-level values using geometric mean. The miRNome dataset was already normalized and transformed; thus, the information of their targeting genes was simply added to each miRNA ID, using the information provided by miTaRBase (Huang et al. 2019). As a result, all three datasets had been mapped to their appropriate gene product-level (or, protein-level) identifiers. """ ; ns1:keywords "rare diseases, Pathway Analysis, workflow, Proteomics, protein, mirna prediction" ; ns1:license ; ns1:name "EJP-RD WP13 case-study: CAKUT proteome, peptidome and miRNome data analysis using WikiPathways" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "canuConcurrency" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "corMaxEvidenceErate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "diploidOrganism" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "falseValue" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "genomeSize" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "illuminaClip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "leading" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "maxFragmentLens" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minReadLen" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minThreads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minlen" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "orientation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pacBioDataDir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pacBioInBam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pacBioTmpDir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "partialMatch" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "phredsPe" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "polishedAssembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "prefix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "readsPe1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "readsPe2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "repBaseLibrary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "slidingWindow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "taxons" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trailing" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trueValue" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "arrowAssembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assemblyMasked" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "assemblyMerged" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "canuAssembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "contaminatedReads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "correctedReads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deconClassification" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deconReport" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "decontaminatedReads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pilonAssembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sortedBamIndexFileOut" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmedReadFiles1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmedReadFiles2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmedReads" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-04-20T23:10:26Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:43Z"^^ns1:Date ; ns1:description """## CWL based workflow to assemble haploid/diploid eukaryote genomes of non-model organisms The workflow is designed to use both PacBio long-reads and Illumina short-reads. The workflow first extracts, corrects, trims and decontaminates the long reads. Decontaminated trimmed reads are then used to assemble the genome and raw reads are used to polish it. Next, Illumina reads are cleaned and used to further polish the resultant assembly. Finally, the polished assembly is masked using inferred repeats and haplotypes are eliminated. The workflow uses BioConda and DockerHub to install required software and is therefore fully automated. In addition to final assembly, the workflow produces intermediate assemblies before and after polishing steps. The workflow follows the syntax for CWL v1.0. 
### Dependencies # Programs The pipeline can be run either using [Cromwell](https://cromwell.readthedocs.io/en/stable) or [cwltool reference](https://github.com/common-workflow-language/cwltool) implementation and docker containers can be run either using [Singularity](https://singularity.lbl.gov) or [udocker](https://github.com/indigo-dc/udocker). Cromwell implementation * [cromwell v44](https://github.com/broadinstitute/cromwell/releases/tag/44) * [java-jdk v8.0.112](https://www.java.com/en) Reference implementation * [cwltool v1.0.20181012180214](https://github.com/common-workflow-language/cwltool) * [nodejs v10.4.1 required by cwltool](https://nodejs.org/en) * [Python library galaxy-lib v18.5.7](https://pypi.org/project/galaxy-lib) Singularity software packages have to be installed server-wide by administrator * [Singularity v3.2.1](https://singularity.lbl.gov) * [squashfs-tools v4.3.0](https://github.com/plougher/squashfs-tools) Udocker software package can be installed locally * [udocker v1.1.2](https://github.com/indigo-dc/udocker) # Data * [Illumina adapters converted to FASTA format](http://sapac.support.illumina.com/downloads/illumina-adapter-sequences-document-1000000002694.html) * [NCBI nucleotide non-redundant sequences for decontamination with Centrifuge](http://www.ccb.jhu.edu/software/centrifuge) * [RepBase v17.02 file RMRBSeqs.embl](https://www.girinst.org/repbase) ### Installation Install miniconda using installation script ```installConda.sh```. To install CWL, use either installation script ```installCromwell.sh``` or ```installCwltool.sh```. To install udocker, use installation script ```installUdocker.sh```. To install singularity, ask your system administrator. 
``` # First confirm that you have the program 'git' installed in your system > cd > git clone -b 'v0.1.3-beta' --single-branch --depth 1 https://github.com/vetscience/Assemblosis > cd Assemblosis > bash installConda.sh > bash installCromwell.sh # or bash installCwltool.sh > bash installUdocker.sh # if singularity cannot be installed or does not run ``` For data dependencies: download and extract [RepBase database](https://www.girinst.org/repbase), download Centrifuge version of [NCBI nt database](http://www.ccb.jhu.edu/software/centrifuge) and create [Illumina adapter FASTA file](http://sapac.support.illumina.com/downloads/illumina-adapter-sequences-document-1000000002694.html) to your preferred locations. If your reads are clean from adapters, the adapter FASTA file can be empty. Give the location of these data in the configuration (.yml) file (see **Usage**). ### Usage You have to create a YAML (.yml) file for each assembly. This file defines the required parameters and the location for both PacBio and Illumina raw-reads. ``` > cd > export PATH=~/miniconda3/bin:$PATH > cd Assemblosis/Run > cp ../Examples/assemblyCele.yml . 
"Edit assemblyCele.yml to fit your computing environment and to define the location for the read files, databases and Illumina adapters" "Running docker images using Cromwell and singularity:" > java -Dconfig.file=cromwell.singularity.conf -jar cromwell-44.jar run -t CWL -v v1.0 assembly.cwl -i assemblyCele.yml "Running docker images using Cromwell and udocker:" > java -Dconfig.file=cromwell.udocker.conf -jar cromwell-44.jar run -t CWL -v v1.0 assembly.cwl -i assemblyCele.yml "Running docker images using Cwltool and singularity:" > cwltool --tmpdir-prefix /home//Tmp --beta-conda-dependencies --cachedir /home//Cache --singularity --leave-tmpdir assembly.cwl assemblyCele.yml "Running docker images using Cwltool and udocker:" > cwltool --tmpdir-prefix /home//Tmp --beta-conda-dependencies --cachedir /home//Cache --user-space-docker-cmd udocker --leave-tmpdir assembly.cwl assemblyCele.yml ``` An annotated example of the YAML file for Caenorhabditis elegans assembly. ``` ## Directory, which contains the PacBio raw data # NOTE! The software looks for all .h5 files (or bam files if pacBioInBam below is defined true) in given directory pacBioDataDir: class: Directory location: /home//Dna ## PacBio files are in bam format as returned from Sequel platform pacBioInBam: true ## Prefix for the resultant assembly files prefix: cele ## Maximum number of threads used in the pipeline threads: 24 ## Minimum number of threads per job used in canu assembler minThreads: 4 ## Number of concurrent jobs in canu assembler (recommended to use threads / minThreads) canuConcurrency: 6 ### Parameters for the program Canu are described in https://canu.readthedocs.io/en/latest/parameter-reference.html ## Expected genome size. This parameter is forwarded to Canu assembler. genomeSize: 100m ## Minimum length for the PacBio reads used for the assembly. This parameter is forwarded to Canu assembler. 
# The maximum resolvable repeat regions becomes 2 x minReadLength minReadLen: 6000 ## Parameter for Canu assembler to adjust to GC-content. Should be 0.15 for high or low GC content. corMaxEvidenceErate: 0.20 ### Parameters for the program Trimmomatic are described in http://www.usadellab.org/cms/?page=trimmomatic ## Paired-end (PE) reads of Illumina raw data. These files are given to the program Trimmomatic. # NOTE! Data for two paired libraries is given below. readsPe1: - class: File format: edam:format_1930 # fastq path: /home//Dna/SRR2598966_1.fastq.gz - class: File format: edam:format_1930 # fastq path: /home//Dna/SRR2598967_1.fastq.gz readsPe2: - class: File format: edam:format_1930 # fastq path: /home//Dna/SRR2598966_2.fastq.gz - class: File format: edam:format_1930 # fastq path: /home//Dna/SRR2598967_2.fastq.gz ## Phred coding of Illumina data. This parameter is forwarded to Trimmomatic. # NOTE! Each read-pair needs one phred value. phredsPe: ['33','33'] ## Sliding window and illuminaClip parameters for Trimmomatic slidingWindow: windowSize: 4 requiredQuality: 25 illuminaClip: adapters: class: File path: seedMismatches: 2 palindromeClipThreshold: 30 simpleClipThreshold: 10 minAdapterLength: 20 keepBothReads: true ## Further parameters for Trimmomatic # Required phred-quality for leading 5 nucleotides leading: 25 # Required phred-quality for trailing 5 nucleotides trailing: 25 # Minimum accepted read-length to keep the read after trimming minlen: 40 ### Parameters for the program bowtie2 are described in http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml ## Illumina PE fragment length. Program bowtie2 parameter -X. # NOTE! Each read-pair needs one phred value. maxFragmentLens: [500, 600] # Orientation of pair-end reads e.g. 
'fr', 'rf', 'ff': Program bowtie2 parameters --fr, --rf or --ff orientation: 'fr' ### Parameters for the program Pilon are described in https://github.com/broadinstitute/pilon/wiki/Requirements-&-Usage # Prefix for the resultant pilon polished assembly. Pilon parameter --output polishedAssembly: celePilon # This is set 'true' for an organism with diploid genome: Pilon parameter --diploid diploidOrganism: true # Value 'bases' fixes snps and indels: Pilon parameter --fix fix: bases ### Parameters for the program centrifuge are described in http://www.ccb.jhu.edu/software/centrifuge/manual.shtml # Path to the directory, that contains NCBI nt database in nt.?.cf files. Centrifuge parameter -x database: class: Directory path: /home//ntDatabase # Length of the identical match in nucleotides required to infer a read as contaminant. Centrifuge parameter --min-hitlen partialMatch: 100 # NCBI taxon root identifiers for the species considered contaminants: e.g. bacteria (=2), viruses (=10239), fungi (=4751), mammals (=40674), artificial seqs (=81077). Pipeline specific parameter. taxons: [2,10239,4751,40674,81077] ## Parameters for the RepeatModeler and RepeatMasker are described in http://www.repeatmasker.org repBaseLibrary: class: File # This is the RepBase file from https://www.girinst.org/repbase. RepeatMasker parameter -lib path: /home//RepBaseLibrary/RMRBSeqs.embl # Constant true and false values for repeat masker trueValue: true falseValue: false ``` ### Runtimes and hardware requirements The workflow was tested in Linux environment (CentOS Linux release 7.2.1511) in a server with 24 physical CPUs (48 hyperthreaded CPUs) and 512 GB RAM. | Assembly | Runtime in CPU hours | RAM usage (GB) | | --- | --- | --- | | *Caenorhabditis elegans* | 1537 | 134.1 | | *Drosophila melanogaster* | 6501 | 134.1 | | *Plasmodium falciparum* | 424 | 134.1 | Maximum memory usage of 134.1 GB was claimed by the program Centrifuge for each assembly. 
### Software tools used in this pipeline * [Dextractor v1.0](https://github.com/thegenemyers/DEXTRACTOR) * [Trimmomatic v0.36](http://www.usadellab.org/cms/?page=trimmomatic) * [Centrifuge v1.0.3](http://www.ccb.jhu.edu/software/centrifuge) * [Canu v1.8](http://canu.readthedocs.io/en/latest/index.html) * [Arrow in SmrtLink v7.0.1](https://www.pacb.com/support/software-downloads) * [Bowtie 2 v2.2.8](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) * [SAMtools v1.6](http://samtools.sourceforge.net) * [Pilon v1.22](https://github.com/broadinstitute/pilon) * [RepeatMasker v4.0.6](http://www.repeatmasker.org) * [RepeatModeler v1.0.11](http://www.repeatmasker.org) * [RepBase v17.02](https://www.girinst.org/repbase) * [HaploMerger2 build_20160512](https://github.com/mapleforest/HaploMerger2) ### Cite If you use the pipeline, please cite: Korhonen, Pasi K., Ross S. Hall, Neil D. Young, and Robin B. Gasser. "Common Workflow Language (CWL)-based software pipeline for de novo genome assembly from long-and short-read data." GigaScience 8, no. 4 (2019): giz014. """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Assemblosis" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-04-20T23:21:34Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:45Z"^^ns1:Date ; ns1:description """# ESCALIBUR Escalibur Population Genomic Analysis Pipeline is able to explore key aspects centering the population genetics of organisms, and automates three key bioinformatic components in population genomic analysis using Workflow Definition Language (WDL: https://openwdl.org/), and customised R, Perl, Python and Unix shell scripts. 
Associated programs are packaged into a platform independent singularity image, for which the definition file is provided. The workflow for analysis using Escalibur consists of three steps - each step can be run in a separate workflow in a sequential manner; step 2 is optional. 1. Trimming and mapping the raw data - selection of the best reference genome; 2. Removing the contamination from mapped data; 3. Recalibration, variant calling and filtering; This implementation runs both locally and in a distributed environment that uses SLURM job scheduler. ## Dependencies Following software dependencies are required: * Git * SLURM scheduler required for distributed HPC environment (https://slurm.schedmd.com/documentation.html) * Python3.7: (https://www.python.org/) * Perl 5.26.2: (https://www.perl.org/) * Java 1.8 * Singularity 3.7.3: (https://sylabs.io/singularity/) ## Step 1: Installation Typically, the installation of Singularity requires root rights. You should therefore contact your administrator to get it correctly installed. Minimum Linux kernel version requirement is 3.8, though >= 3.18 would be preferred (https://sylabs.io/guides/3.5/admin-guide/installation.html). Clone the git repository to a directory on your cluster or stand-alone server. 
``` > git clone --depth 1 -b v0.3-beta https://gitlab.unimelb.edu.au/bioscience/escalibur.git > cd escalibur ``` ### Description of Files * `workflow-main.local.config`: main configuration file for stand alone server runtime environment * `workflow-main.slurm.config`: main configuration file for HPC runtime environment that support Slurm job scheduler * `workflow-mapping.json`: defines location of input files, has behavioral settings and sets resource allocations * `workflow-cleaning.json`: defines location of input files and sets resource allocations * `workflow-variants.json`: defines location of input files, has behavioral settings and sets resource allocations * `workflow-mapping.wdl`: main workflow file to trim and map PE reads into the genome * `workflow-cleaning.wdl`: main workflow file to clean contamination from mapped PE reads against genomes representing putative contamination * `workflow-variants.wdl`: main workflow file to call variants using mapped and cleaned reads * `workflow-mapping.outputs.json`: defines location for resultant outputs and logs from mapping workflow * `workflow-cleaning.outputs.json`: defines location for resultant outputs and logs from cleaning workflow * `workflow-variants.outputs.json`: defines location for resultant outputs and logs from variants workflow * `inputReads.txt`: example input file for fastq read files to mapping step * `cleanup.conf`: example configuration file for putative host contamination to cleaning step * `inputBams.txt`: example input file for resultant BAM files to variant calling step * `references.txt`: contains list of example references genomes * `perl_scripts`: contains Perl scripts used by the pipeline * `scripts`: contains Python scripts used by the pipeline * `R_scripts`: contains R scripts used by the pipeline * `sub_workflows`: sub-workflows, one for each of the workflow steps * `tasks`: workflow tasks * `cromwell-50.jar`: java archive file required to run the workflow. 
Two config files have been created. One for stand alone server (`workflow-runtime.local.config`) and another one for HPC environment that supports Slurm scheduler (`workflow-runtime.slurm.config`). These files have already been optimised. For slurm configuration you only need to define the HPC partition in line 35: "String rt_queue" Change this to the partition you have access to on HPC environment. Files `workflow-mapping.outputs.json`, `workflow-cleaning.outputs.json` and `workflow-variants.outputs.json` define the directories to copy the result files to. Modify if you want to change default output directories `outputMapping`, `outputCleaning` and `outputVariants`. These output directories are generated to the directory `escalibur`. #### NOTE: delete output directories from previous runs. If you have files there already and a name matches during the copy, the workflow may fail. `Singularity` directory contains the definition file for the software used in Escalibur. Pre-built singularity image can be downloaded from `library://pakorhon/workflows/escalibur:0.0.1-beta`. ``` > singularity pull escalibur.sif library://pakorhon/workflows/escalibur:0.0.1-beta ``` ## Step 2: Test run To confirm correct function of the workflows (`mapping`, `cleaning` and `variant calling`), fix the required absolute paths, marked by three dots `...` in `workflow-mapping.json`, `workflow-cleaning.json` and `workflow-variants.json` and configuration files `cleanup.conf` and `inputBams.txt`, and run the workflow with the provided test and configuration files, and parameter settings. 
``` > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-mapping.wdl -i workflow-mapping.json -o workflow-mapping.outputs.json > out.mapping 2> err.mapping > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-cleaning.wdl -i workflow-cleaning.json -o workflow-cleaning.outputs.json > out.cleaning 2> err.cleaning > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-variants.wdl -i workflow-variants.json -o workflow-variants.outputs.json > out.variants 2> err.variants ``` Slurm file templates `runMapping.slurm`, `runCleaning.slurm` and `runVariants.slurm` are available for each workflow. #### NOTE: default parameter settings for run-times, memory usage and module loading may require adjustment in these files if run in HPC environment using slurm. Current settings should account for the test run. After the runs are complete, the results will be at the output directories: `outputMapping`, `outputCleaning` and `outputVariants`. You can compare the result of `outputVariants/full_genotype_output.vcf` to that of the pre-run `TestResults/full_genotype_output.vcf`. ## Step 3: Mapping Make a directory for your fastq files e.g. `Reads` and copy your paired end raw data in there. ``` > mkdir Reads ``` It should look something like below ``` > ls TestReads/ 1-1_r1.fastq.gz 32-1_r1.fastq.gz 44-1_r1.fastq.gz 1-1_r2.fastq.gz 32-1_r2.fastq.gz 44-1_r2.fastq.gz ``` Run the python script to create a file of your input samples and edit the resulting file to match your sample identifiers and libraries. ``` > python3 scripts/inputArgMaker.py -d Reads/ -p -ps 33 -pq 20 -pl ILLUMINA -ml 50 -o inputReads.txt ``` The edited output file is shown below. The script will automatically sort the files by size. 
 
``` > cat inputReads.txt # Prefix PE/SE MinLen PhredS Sequencer PhredQ Library Read Group ID Sample Platform Unit First pair of PE reads Second pair of PE reads test1 PE 50 33 ILLUMINA 28 LIB1 CL100082180L1 SM1 CL100082180L1 ./TestReads/1-1_r1.fastq.gz ./TestReads/1-1_r2.fastq.gz test2 PE 50 33 ILLUMINA 20 LIB2 CL100082180L1 SM2 CL100082180L1 ./TestReads/44-1_r1.fastq.gz ./TestReads/44-1_r2.fastq.gz test3 PE 50 33 ILLUMINA 20 LIB3 CL100034574L1 SM2 CL100034574L1 ./TestReads/32-1_r1.fastq.gz ./TestReads/32-1_r2.fastq.gz ``` #### NOTE: If several libraries are embedded in a single read file, library-specific reads have to be separated into their own files before creating the inputReads.txt file. In contrast, the inputReads.txt file format can accommodate multiple library files to a single sample. * `Prefix`: Prefix for the resultant files from trimming. * `PE/SE`: Paired-End/Single-End reads as input. * `MinLen`: Minimum Length of reads after trimming. * `PhredS`: Used Phred coding by the sequencer (33 or 64). * `Sequencer`: Name of the sequencer. * `PhredQ`: Phred cut-off score used in trimming. * `Library`: Identifier for the library. * `Read Group ID`: Identifier for the read groups required by GATK (inputArgMaker tries to find this from FASTQ reads). Refer to (https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups). * `Sample`: Identifier for the sample. Defined prefix for resultant sample specific files. * `Platform Unit (optional)`: Information about flow cell, lane and sample. Helps GATK in recalibration (inputArgMaker copies Read Group ID here). Refer to (https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups). * `First pair of PE reads`: Relative path to the forward pair of PE reads. * `Second pair of PE reads`: Relative path to the reverse pair of PE reads. Create a file listing reference genomes and configure `workflow-mapping.json` file. An example reference file (`references.txt`) has been created for you. 
Use this as an example to create your own. Ensure there are no whitespaces at the end of the line or else the cromwell engine will throw an error. Reads are mapped to these reference files and the best matching reference will be selected for variant calling. ``` > cat references.txt scf00001 ./TestReferences/scf00001.fa scf00013 ./TestReferences/scf00013.fa ``` #### NOTE: Reference label (e.g. `scf00001`) must be a substring found in the reference fasta file (`scf00001.fa`) The figure below illustrates the flow of the information, and appearance of labels (`Prefix`, `Sample`, `Label`) in file names, as defined in `inputReads.txt` and `references.txt`. ![](figures/labelFlow.png) ### workflow-mapping.json config file Add the path of your fastq and reference genome input files and change parameters as appropriate, and adjust the absolute paths for singularity image. If `mapping_workflow.readQc` is set to `yes`, reads are trimmed both for quality and the adapters. Adapters to trim are given in `mapping_workflow.pe_filtering_workflow.trimmomatic_pe_task.truseq_pe_adapter`. If you want to use custom adapters, copy them to `adapters` directory and instead of default `TruSeq3-PE.fa`, refer to your custom file. If you don't want to use adapters, use `empty.fa` file instead. For BGISEQ adapters, refer to (https://en.mgitech.cn/Download/download_file/id/71). 
``` { "## CONFIG FILE": "WDL", "mapping_workflow.inputSampleFile": "./inputReads.txt", "mapping_workflow.inputReferenceFile": "./references.txt", "## Parameters for samtools read filtering": "-F 4 does filters unmapped reads from resultant files", "mapping_workflow.samtoolsParameters": "-F 4", "## Is read QC required": "yes or no", "mapping_workflow.readQc": "yes", "## What is the ploidy of given genome": "1 for haploid, 2 for diploid, etc.", "mapping_workflow.ploidy": 2, "## Singularity parameters": "absolute paths to the container and the directory to bind visible inside singularity", "mapping_workflow.singularityContainerPath": "/home/.../escalibur/escalibur.sif", "mapping_workflow.singularityBindPath": "/home/.../escalibur/", "## trimmomatic adapters": "", "mapping_workflow.pe_filtering_workflow.trimmomatic_pe_task.truseq_pe_adapter":"./adapters/TruSeq3-PE.fa", "mapping_workflow.pe_filtering_workflow.trimmomatic_se_task.truseq_se_adapter":"./adapters/TruSeq3-SE.fa", "## Indexing sub workflow task parameters": "Samtools index run time parameters", "mapping_workflow.index_sub_workflow.indexing_sam_task.IST_minutes": 300, "mapping_workflow.index_sub_workflow.indexing_sam_task.IST_threads": 16, "mapping_workflow.index_sub_workflow.indexing_sam_task.IST_mem": 30000, . . . } ``` Run the mapping workflow. ``` > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-mapping.wdl -i workflow-mapping.json -o workflow-mapping.outputs.json > out.mapping 2> err.mapping ``` The resultant BAM files will be copied to `outputMapping` directory. ## Step 4 (optional): Cleaning If you suspect 'host' contamination in your data, you can remove that using the cleaning workflow. Define the file representing the contamination. First column defines the sample identifier, second the resultant BAM file from mapping workflow and third the putative contaminant genome assembly. 
``` > cat cleanup.conf SM1 /home/.../escalibur/outputMapping/SM1.scf00001.MarkDup.bam /home/.../escalibur/Hosts/host1.fa SM2 /home/.../escalibur/outputMapping/SM2.scf00001.MarkDup.bam /home/.../escalibur/Hosts/host1.fa ``` #### NOTE: you have to use absolute paths both to BAM files and the contaminant reference genome (here `host1.fa` and `host2.fa`). ### workflow-cleaning.json config file Add the path of your cleaning config file (here `cleanup.conf`) and adjust the absolute paths for singularity image. ``` { "## CONFIG FILE": "WDL", "cleaning_workflow.inputContaminantFile": "./cleanup.conf", "## Singularity parameters": "absolute paths to the container and the directory to bind visible inside singularity", "cleaning_workflow.singularityContainerPath": "/home/.../escalibur/escalibur.sif", "cleaning_workflow.singularityBindPath": "/home/.../escalibur/", "cleaning_workflow.indexing_bwa_task.IBT_minutes": 60, "cleaning_workflow.indexing_bwa_task.IBT_threads": 1, "cleaning_workflow.indexing_bwa_task.IBT_mem": 16000, "######################################":"########################################", "CLEANING":"PARAMETERS", "######################################":"########################################", "cleaning_workflow.clean_bams_workflow.cleanBams_task.CLEAN_BAMS_minutes": 600, "cleaning_workflow.clean_bams_workflow.cleanBams_task.CLEAN_BAMS_threads": 4, "cleaning_workflow.clean_bams_workflow.cleanBams_task.CLEAN_BAMS_mem": 32000, "cleaning_workflow.create_cleaned_bams_workflow.createCleanedBams_task.CREATE_CLEAN_BAMS_minutes": 300, "cleaning_workflow.create_cleaned_bams_workflow.createCleanedBams_task.CREATE_CLEAN_BAMS_threads": 4, "cleaning_workflow.create_cleaned_bams_workflow.createCleanedBams_task.CREATE_CLEAN_BAMS_mem": 32000, "cleaning_workflow.refsBySample.RBS_minutes": 5, "cleaning_workflow.refsBySample.RBS_threads": 1, "cleaning_workflow.refsBySample.RBS_mem": 4000 } ``` Run the cleaning workflow. 
``` > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-cleaning.wdl -i workflow-cleaning.json -o workflow-cleaning.outputs.json > out.cleaning 2> err.cleaning ``` The resultant cleaned BAM files will be copied to `outputCleaning` directory. You can repeat the workflow if you suspect that there may be more than one contaminant genome per sample. In that case you have to take care of the properly configured `cleanup.conf` file that should describe the BAM files from previous cleaning round but also define new output directory for each round in `workflow-cleaning.outputs.json` file. ## Step 5: Variant calling Define the file listing the BAM files used for variant calling. First column defines the sample identifier, and second the resultant BAM file either from mapping or cleaning workflow. ``` > cat inputBams.txt SM1 /home/.../escalibur/outputMapping/SM1.scf00001.MarkDup.bam SM2 /home/.../escalibur/outputCleaned/SM2.scf00001.MarkDup.cleaned.bam ``` ### workflow-variants.json config file Add the path of your file listing the locations of BAM files (here `inputBams.txt`), and add the location to selected reference genome (found in `outputMapping/best.ref`) and its label, as defined in `references.txt` file. Adjust the absolute paths for singularity image and adjust other parameters, especially define if you want to recalibrate the BAM files by selecting value "independent" to "variants_workflow.call_type". 
``` { "## CONFIG FILE": "WDL", "variants_workflow.inputSampleFile": "./inputBams.txt", "variants_workflow.selectedRefFile": "TestReferences/scf00001.fa", "variants_workflow.selectedRefLabel": "scf00001", "## Singularity parameters": "absolute paths to the container and the directory to bind visible inside singularity", "variants_workflow.singularityContainerPath": "/home/.../escalibur/escalibur.sif", "variants_workflow.singularityBindPath": "/home/.../escalibur/", "## Which variant call workflow to use": "fast or independent", "variants_workflow.call_type": "fast", "## Variant filtering expressions": "For SNPs and INDELs", "variants_workflow.SNP_filt_exp": "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0", "variants_workflow.INDEL_filt_exp": "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0", "## Variant Filter params": "Variant filter, indel, snps, report making: Safe to leave as default", "variants_workflow.ploidy": 2, "variants_workflow.maxIndelSize": 60, "variants_workflow.scafNumLim": 95, "variants_workflow.scafNumCo": 2, "variants_workflow.scafLenCutOff": 0, "variants_workflow.ldWinSize": 10, "variants_workflow.ldWinStep": 5, "variants_workflow.ldCutOff": 0.3, "variants_workflow.snp_indel_var_filtering_workflow.indelFilterName": "Indel_filter", "variants_workflow.snp_indel_var_filtering_workflow.indelFilterExpression": "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0", "variants_workflow.snp_indel_var_filtering_workflow.snpFilterName": "Snp_filter", "variants_workflow.snp_indel_var_filtering_workflow.snpFilterExpression": "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0", "variants_workflow.snp_indel_var_filtering_workflow.vfindel_tk.selectType": "", "variants_workflow.snp_indel_var_filtering_workflow.vfsnp_tk.selectType": "", "## Build chromosome map":"map_def_scf_lim_task", "variants_workflow.snp_indel_var_filtering_workflow.map_def_scf_lim_task.scafLenCutOff": 1000000, 
"variants_workflow.snp_indel_var_filtering_workflow.map_def_scf_lim_task.scafNumCo": 3, "## Indexing sub workflow task parameters": "Samtools index run time parameters", "variants_workflow.ref_index.IST_minutes": 300, "variants_workflow.ref_index.IST_threads": 2, "variants_workflow.ref_index.IST_mem": 8000, . . . } ``` Run the variant calling workflow. ``` > java -Dconfig.file=./workflow-runtime.local.config -jar ./cromwell-50.jar run workflow-variants.wdl -i workflow-variants.json -o workflow-variants.outputs.json > out.variants 2> err.variants ``` The resultant files will be copied to `outputVariants` directory. That includes filtered variants calls (`full_genotype_output.vcf`) and recalibrated BAM files (if independent call_type is selected). ## Other considerations ### Resource allocation in HPC environment Wall time, memory usage and thread count (`_minutes`, `_mem`, `_threads`) given in `.json` files for each workflow can vary substantially and may require adjusting in HPC environment and slurm. This may lead to frequent restarting of the workflow after each adjustment. We have automated this task by providing scripts that automatically check the failed resource allocations and double them for each round. These scripts are located in `Automation` directory and can be run as follows: ``` > cd Automation > sh init.sh # Copies the content of ../tasks directory to tasksOrig directory > sbatch runMapping.slurm # Runs runLoopMapping.sh in a worker node > sbatch runCleaning.slurm # Runs runLoopCleaning.sh in a worker node > sbatch runVariants.slurm # Runs runLoopVariants.sh in a worker node ``` Scripts `runLoop*.sh` copy resource allocations from collective `runtimes.json` file to the files in `../tasks` directory, run the workflow and double the failed resource allocations in `../tasks` files, and reruns the workflow until it succeeds or until ten rounds have passed. 
Copying of resource allocations directly to the files in `../tasks` directory is necessary to guarantee proper function of call-caching. #### NOTE: automated resource allocation adjustment is experimental, should be monitored when running and may require modifications to scripts to function properly. ### Disk usage Cromwell will create duplicate copies of files while running the workflows. It is therefore recommended to remove `cromwell-executions` directory after each workflow is run, if disk space is getting scarce. ``` > rm -r cromwell-executions ``` Especially, if there are hundreds of samples that may sum up to terabytes of data, disk space might become an issue if unused files are not removed. ### Troubleshooting If the output text does not reveal the error, you can try to find an error message using command(s): ``` > find cromwell-executions/ -name stderr -exec cat {} \\; | grep -i fatal > find cromwell-executions/ -name stderr -exec cat {} \\; | less ``` Most commonly encountered error cases: * Singularity is not running correctly. Typically you require help from your administrator to get singularity properly installed. * Singularity image `escalibur.sif` was not downloaded * Check that you are using correct runtime configuration file `workflow-runtime.local.config` or `workflow-runtime.slurm.config` when calling `cromwell-50.jar` * Absolute file paths for Singularity/Trimmomatic, input files or contaminant genomes are not updated or are wrong in `workflow-*.json`, `inputBams.txt` or `cleanup.conf` configuration files, respectively. * Defined run-time and memory requirements for some tasks are not sufficient in `.json` configuration files to run the pipeline in HPC environment. * If you are using slurm job scheduler and want to run the pipeline in HPC environment, you have to create the related configuration file yourself. 
* Pipeline has not been tested in other environments but Linux and we expect that users encounter challenges if trying to run the pipeline e.g. in Mac environment. """ ; ns1:keywords "" ; ns1:license ; ns1:name "Escalibur" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Deduplicate reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output Destination" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Don't output reads." . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filter rRNA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filter human reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Keep mapped reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum memory in MB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filter reference file(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Skip QC filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Skip QC unfiltered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input URLs used for this run" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered forward read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered reverse read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtering reports folder" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2025-11-06T13:41:16Z"^^ns1:Date ; ns1:dateModified "2025-11-06T16:37:28Z"^^ns1:Date ; ns1:description """**Workflow for short paired end reads quality control, trimming and filtering.**
Multiple paired datasets will be merged into single paired dataset.
Summary: - Sequali QC on raw data files
- fastp for read quality trimming
- BBduk for phiX and rRNA filtering (optional)
- Filter human reads using Hostile (optional)
- Custom read filtering using Hostile (optional)
- Sequali QC on filtered (merged) data
Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default

**All tool CWL files and other workflows can be found at:**
https://gitlab.com/m-unlock/cwl **How to setup and use an UNLOCK workflow:**
https://docs.m-unlock.nl/docs/workflows/setup.html
""" ; ns1:image ; ns1:input , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "illumina, Genomics, Transcriptomics, quality, filtering, Classification" ; ns1:license ; ns1:name "Short read quality control, trimming and contamination filter" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output Destination" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Contamination reference file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "identifier used" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Keep mapped reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum memory in MB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nanopore reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CWL base step number" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered nanopore reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtering reports folder" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-04-21T16:19:59Z"^^ns1:Date ; ns1:dateModified "2023-04-07T14:07:55Z"^^ns1:Date ; ns1:description """### Workflow for LongRead Quality Control and Filtering - NanoPlot (read quality control) before and after filtering - Filtlong (read trimming) - Kraken2 taxonomic read classification before and after filtering - Minimap2 read filtering based on given references

Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default

**All tool CWL files and other workflows can be found here:**
https://gitlab.com/m-unlock/cwl/workflows **How to setup and use an UNLOCK workflow:**
https://m-unlock.gitlab.io/docs/setup/setup.html
""" ; ns1:image ; ns1:input , , , , , , , , ; ns1:keywords "Genomics, nanopore, CWL, Assembly" ; ns1:license ; ns1:name "LongRead Quality Control and Filtering" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bowtie2 on input dataset(s): alignments" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DESeq2 plots on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DESeq2 result file on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC on input dataset(s): RawData" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC on input dataset(s): Webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts on input dataset(s): Counts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts on input dataset(s): Summary" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-03T13:05:01Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description """Objective. Biomarkers have become important for the prognosis and diagnosis of various diseases. High-throughput methods such as RNA-sequencing facilitate the detection of differentially expressed genes (DEGs), hence potential biomarker candidates. Individual studies suggest long lists of DEGs, hampering the identification of clinically relevant ones. Concerning preeclampsia, a major obstetric burden with high risk for adverse maternal and/or neonatal outcomes, limitations in diagnosis and prediction are still important issues. Therefore, we developed a workflow to facilitate the screening for biomarkers. Methods. 
Based on the tool DESeq2, we established a comprehensive workflow for the identification of DEGs, analyzing data from multiple publicly available RNA-sequencing studies. We applied it to four RNA-sequencing datasets (one blood, three placenta) analyzing patients with preeclampsia and normotensive controls. We compared our results with other published approaches and evaluated their performance. Results. We identified 110 genes dysregulated in preeclampsia, observed in ≥3 of the analyzed studies, six even in all four studies. Among them were FLT-1, TREM-1, and FN1 which either represent established biomarkers on protein level, or promising candidates based on recent studies. In comparison, using a published meta-analysis approach we obtained 5,240 DEGs. Conclusions. We present a data analysis workflow for preeclampsia biomarker screening, capable of identifying significant biomarker candidates, while drastically decreasing the numbers of candidates. Moreover, we were also able to confirm its performance for heart failure. Our approach can be applied to additional diseases for biomarker identification and the set of identified DEGs in preeclampsia represents a resource for further studies. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.338.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Biomarker screening in preeclampsia" ; ns1:output , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Shell Script" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2022-05-05T05:02:08Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:52Z"^^ns1:Date ; ns1:description """# GermlineShortV_biovalidation - [Description](#description) - [Diagram](#diagram) - [User guide](#user-guide) - [Quick start guide](#quick-start-guide) - [Benchmarking](#benchmarking) - [Workflow summaries](#workflow-summaries) - [Metadata](#metadata) - [Component tools](#component-tools) - [Required (minimum) inputs/parameters](#required-minimum-inputsparameters) [Preparing your own input files](#preparing-input-files) - [Additional notes](#additional-notes) - [Understanding your outputs](#understanding-your-outputs) - [Performance metrics explained](#performance-metrics-explained) - [Help/FAQ/Troubleshooting](#helpfaqtroubleshooting) - [Acknowledgements/citations/credits](#acknowledgementscitationscredits) ## Description Population-scale WGS cohorts are essential resources for genetic analyses including heritable diseases, evolutionary genomics, conservation biology, and population genomics. Processing raw reads into analysis-ready variants remains challenging. Various mapping and variant calling pipelines have been made publicly available in recent decades. Designing a mapping and variant calling pipeline to meet your needs is dependent on the compute infrastructure you’re working on, the types of variants you’re primarily interested in, and the sequencing technology you use to generate raw sequencing data. Keep in mind that the tools you use to build your pipeline can affect variant calling accuracy. Further, optimisation and customisation of these tools’ commands can also affect their performance. 
Best-practice recommendations for variant calling pipelines vary dramatically between species and research questions, depending on the availability of genomic resources for the population of interest, genome structure, and clinical relevance of the resulting variant dataset. It is important to not only design a robust variant calling pipeline but also fine-tune it to achieve optimal performance for your dataset and research question. There are various measurements that you can apply to evaluate the biological accuracy of your germline variant calling pipeline. Currently, no best practice methods for interrogating joint-called variant sets exist in the literature. A number of publicly available, human ‘gold standard’ truth datasets including Platinum Genomes and Genome in a Bottle (GIAB) are useful for benchmarking across high confidence regions of the genome and evaluating the recall and precision of the pipeline. We recommend individuals working with human datasets benchmark their germline variant calling pipelines using one of these datasets. Unfortunately, these resources are not typically available for non-human organisms. Here, we present protocols for benchmarking and validating germline short variant (SNVs and indels) datasets using a combination of methods that can capture the quality of your variant sets for human, non-human model, and non-model organisms. The process you can apply will depend on the organism you’re working with and the genomic resources available to that organism. ## Diagram

## User guide ### Quick start guide These bash scripts were written for the University of Sydney’s high performance computer, Artemis. They can be run on the command line or submitted as PBS jobs. These scripts assume your input is a gzipped multi-sample (cohort) VCF file. Before running, edit the PBS project directive and define the variables at the top of the script. All software used in this protocol is installed on Artemis- to use alternate versions or run on a different compute infrastructure, edit the modules according to your needs. #### Human datasets For human datasets, we recommend you benchmark your germline variant calling pipeline using a gold standard dataset such as Platinum Genomes. Raw sequence data in FASTQ format for these datasets can be downloaded along with their high confidence variant calls and regions from public repositories. See [Preparing input files]() for more information on how to download and prepare these files. ##### 1. Collect vcf summary metrics Edit the PBS -P directive and variables for your dataset in `vcfstat.sh`. Then run script with: ``` qsub vcfstat.sh (or bash vcfstat.sh) ``` This will produce summary and quality metrics reports and plots for your cohort. It will also produce summary and detail files for known variant representation. BCFtools stats plots will be housed in a directory labelled `${cohort}_vcfplots`. ##### 2. Biological benchmarking using a truth set Edit the PBS -P directive and variables for your files. Then run script with: ``` qsub run_happy.sh ``` This script will subset your multi-sample VCF into individual samples, prepare them for hap.py, and output a number of files including summary metrics (including recall, precision and F1-score) and ROC count files that can be used to produce ROC curves, separately for SNVs and indels. See the [hap.py user guide](https://github.com/Illumina/hap.py/blob/master/doc/happy.md) for more information on how to interpret hap.py output. 
ROC curves of Hap.py runs can be plotted using the script [rocplot.Rscript](https://github.com/Illumina/hap.py/blob/master/src/R/rocplot.Rscript). #### Non-human model organism datasets ##### 1. Collect vcf summary metrics Edit the PBS -P directive and variables for your dataset in `vcfstat.sh`. We recommend you use the set of known variants used for base quality score recalibration to validate population level variants. If you used trio data, unhash the Mendelian error command within the script. Then run script with: ``` qsub vcfstat.sh (or bash vcfstat.sh) ``` This will produce summary and quality metrics reports and plots for your cohort. It will also produce summary and detail files for known variant representation. BCFtools stats plots will be housed in a directory labelled `${cohort}_vcfplots`. #### Non-model organism datasets ##### 1. Collect vcf summary metrics Edit the PBS -P directive and variables for your dataset in `vcfstat_nonmodel.sh`. Then run script with: ``` qsub vcfstat_nonmodel.sh (or bash vcfstat_nonmodel.sh) ``` This will produce summary and quality metrics reports and plots for your cohort. It will also produce summary and detail files for known variant representation. BCFtools stats plots will be housed in a directory labelled `${cohort}_vcfplots`. ## Benchmarking Coming soon! 
## Workflow summaries ### Metadata |metadata field | workflow_name / workflow_version | |-------------------|:---------------------------------:| |Version | 1.0 | |Maturity | stable | |Creators | Georgie Samaha, Tracy Chew, Cali Willet | |Source | NA | |License | NA | |Workflow manager | NA | |Container | None | |Install method | Manual | |GitHub | NA | |bio.tools | NA | |BioContainers | NA | |bioconda | NA | ### Component tools bcftools/1.14 htslib/1.14 python/3.8.2 R/4.1.1 hap.py/0.3.14 ### Required (minimum) inputs/parameters - Multi-sample or single sample VCF file (VCF.gz format) - List of sample IDs that match the VCF (.txt format) - Known variant dataset (VCF format. Human and non-human model organisms only) - Pedigree file (format: mother,father,offspring. Trios or Platinum Genomes only) - Truth set variant calls (VCF.gz format. Human, Platinum Genomes only) - High confidence call regions (BED format. Human, Platinum Genomes only) ### Preparing input files #### Gold standard variant truth sets The benchmarking protocol for human datasets assumes you have performed mapping and germline variant calling on a gold standard truth set. These datasets contain millions of variants that have been confirmed using orthologous technologies [Eberle et al. 2017](https://doi.org/10.1101/gr.210500.116). We recommend you use the Platinum Genomes dataset for benchmarking germline variant calling pipelines that include joint genotyping of multiple samples. Six members, comprising two trios, of the Platinum Genomes dataset can be downloaded from the Illumina BaseSpace Sequence Hub, the ENA, or dbGaP. The Platinum Genomes dataset contains multiple files including the following files you will need for running `run_happy.sh`: - Paired-end FASTQ files for each sample - High-confidence germline variant VCF files for each sample - High-confidence genomic regions (BED format) Currently, these files are available for Hg19 (GRCh37) and Hg38 (GRCh38) . 
Links to raw data are [here](https://github.com/Illumina/PlatinumGenomes). BaseSpace offers a command line tool for downloading files, see [here](https://developer.basespace.illumina.com/docs/content/documentation/cli/cli-examples) for instructions. #### Providing your own ‘truth set’ *A word of caution*- testing the performance of your pipeline using a truth set is only intended to estimate the overall quality of your pipeline and detect any potential sources of error in your method. It is not intended to test the truthfulness of your variant set. See [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531572-Evaluating-the-quality-of-a-germline-short-variant-callset) for further discussion of the assumptions we make about truth sets. Most non-human organisms do not have access to gold standard truth set resources like the Platinum Genomes dataset. However there are a few alternative options you could try: - Genotyping arrays: if you have genotyping data for the same samples you tested your germline variant calling pipeline with, you can reformat these to VCF using a tool like [PLINK’s recode](https://www.cog-genomics.org/plink/1.9/data#recode) and use it as a truth set. - Known variant datasets: if your organism of interest has a set of known population-level variants you can use these as a truth-set. Just remember that these variants might not always be validated (i.e. dbSNP). Using this method you will need to also provide your own high-confidence regions file in BED format. The location and size of these regions will depend on your dataset, organism, reference assembly and sequencing method. Typically these regions would exclude centromeres, telomeres and repetitive parts of the genome that are likely to complicate variant calling. 
## Additional notes Test data for Hap.py can be found [here](https://github.com/Illumina/hap.py/blob/master/doc/microbench.md) Instructions on how to install Hap.py can be found [here](https://github.com/Illumina/hap.py#installation) This warning may be thrown by Hap.py and can be ignored: `WARNING No reference file found at default locations. You can set the environment variable 'HGREF' or 'HG19' to point to a suitable Fasta file.` ### Understanding your outputs The following files will be produced and stored in your designated working directory. They will all be labelled with your specified cohort name. #### Variant based metrics Produced by BCFtools stats command. Output file: - ${cohort}.bcftools.metrics - ${cohort}_bcftools.metrics_vcfstatplots (directory and files) #### Sample based metrics Produced by BCFtools smplstats and mendelian commands. Output files: - ${cohort}.smplstats - ${cohort}.smplstats.pdf - ${cohort}.Mendelianerr #### Known variant concordance Produced by GATK CollectVariantCallingMetrics command. Output files: - ${cohort}.known.variant_calling_summary_metrics - ${cohort}.known.variant_calling_detail_metrics #### Biological validation using a truth set Produced by Hap.py. 
Output files: - ${sample}.happy.metrics.json.gz - ${sample}.happy.roc.all.csv.gz - ${sample}.happy.roc.Locations.INDEL.csv.gz - ${sample}.happy.roc.Locations.INDEL.PASS.csv.gz - ${sample}.happy.roc.Locations.SNP.csv.gz - ${sample}.happy.roc.Locations.SNP.PASS.csv.gz - ${sample}.happy.roc.tsv - ${sample}.happy.runinfo.json - ${sample}.happy.summary.csv ### Performance metrics explained |Metric |Expected/ideal value |Tool |Relevance | |--------------------------------------|----------------------------------------------------|---------------|---------------------------------------------------------------------------------------------------------------| |Number of SNVs and indels (per sample)|Human WGS: ~4.4M, Human WES: ~41k, Species dependent|bcftools stats |Population, sequencing approach, and genomic region dependent. Alone, this metric cannot indicate data quality.| |Indel length distribution |Indel length range is 1-10,000bp. |bcftools stats |Increased length is conflated with reduced mapping quality. Distribution is dataset dependent. Recommend filtering for high quality.| |Depth of coverage |Depends on the sequencing coverage of samples. |bcftools stats |Dramatic deviation from expected distribution can indicate artifactual bias. | |Substitution type counts |See TiTv ratio. |bcftools stats |Twice as many possible transversions as transitions. See [here](https://dx.doi.org/10.1093%2Fbioinformatics%2Fbtu668) | |TiTv ratio (genome wide) |For mammals: WGS: 2.0-2.1, WES: 3.0-3.3 |bcftools stats |Dramatic deviation from expected ratio can indicate artifactual bias. Typically elevated in coding regions where transversions are more likely to occur. | |Base quality distribution |Dataset dependent. |bcftools stats |This will reflect the quality based filtering you performed. 
Dramatic deviation from expected ratio can indicate artifactual bias.| |Indel ratio |Common: ~1.0, Rare: 0.2-0.5 |GATK CollectVariantCallingMetrics|This should be evaluated after custom filtering variants for your needs. Dramatic deviation from expected ratio can indicate artifactual bias.| |Het/hom(non-ref) |~2.0 assuming Hardy-Weinberg equilibrium. |GATK CollectVariantCallingMetrics|Ancestry dependent, can vary dramatically. See [Wang et al. 2015](https://dx.doi.org/10.1093%2Fbioinformatics%2Fbtu668)| |Mendelian error |0 |BCFtools +mendelian|Mendelian inheritance errors are likely erroneous genotype calls. See [Pilipenko et al. 2014](https://dx.doi.org/10.1186%2F1753-6561-8-S1-S21)| |True positives |Dataset dependent. |Hap.py |Number of query variants that are present in the truth set. | |False negatives |Dataset dependent. |Hap.py |Number of variants in truth set, not present in query VCF. | |False positives |Dataset dependent. |Hap.py |Number of variants in query VCF, not present in truth set. | |Recall |1 |Hap.py |Absence of false negatives. See [Krusche et al. 2019](https://doi.org/10.1038/s41587-019-0054-x) | |Precision |1 |Hap.py |Absence of false positives. See [Krusche et al. 2019](https://doi.org/10.1038/s41587-019-0054-x) | |F1-score |1 |Hap.py |Harmonic mean of recall and precision. See [Krusche et al. 2019](https://doi.org/10.1038/s41587-019-0054-x) | |Genotype errors (FP.GT) |Dataset dependent. |Hap.py |Number of query variants with incorrect genotype | ### Resources and references Eberle, M. A., Fritzilas, E., Krusche, P., Källberg, M., Moore, B. L., Bekritsky, M. A., Iqbal, Z., Chuang, H. Y., Humphray, S. J., Halpern, A. L., Kruglyak, S., Margulies, E. H., McVean, G., & Bentley, D. R. (2017). A reference data set of 5.4 million phased human variants validated by genetic inheritance from sequencing a three-generation 17-member pedigree. Genome research, 27(1), 157–164. https://doi.org/10.1101/gr.210500.116 Koboldt, D.C. 
Best practises for variant calling in clinical sequencing. Genome Med 12, 91 (2020). https://doi.org/10.1186/s13073-020-00791-w Krusche, P., Trigg, L., Boutros, P.C. et al. Best practices for benchmarking germline small-variant calls in human genomes. Nat Biotechnol 37, 555–560 (2019). https://doi.org/10.1038/s41587-019-0054-x Marshall, C.R., Chowdhury, S., Taft, R.J. et al. Best practices for the analytical validation of clinical whole-genome sequencing intended for the diagnosis of germline disease. npj Genom. Med. 5, 47 (2020). https://doi.org/10.1038/s41525-020-00154-9 Pilipenko, V.V., He, H., Kurowski, B.G. et al. Using Mendelian inheritance errors as quality control criteria in whole genome sequencing data set. BMC Proc 8, S21 (2014). https://doi.org/10.1186/1753-6561-8-S1-S21 Wang, J., Raskin, J., Samuels, D., Shyr, Y., Guo, Y., Genome measures used for quality control are dependent on gene function and ancestry, Bioinformatics 31, 318–323 (2015) https://doi.org/10.1093/bioinformatics/btu668 ## Help/FAQ/Troubleshooting If Hap.py throws an error, search the [issues at Hap.py GitHub repository](https://github.com/Illumina/hap.py/issues) and attempt to resolve it before submitting an issue here. ## Acknowledgements/citations/credits ### Authors - Georgie Samaha (Sydney Informatics Hub, University of Sydney) - Tracy Chew (Sydney Informatics Hub, University of Sydney) - Cali Willet (Sydney Informatics Hub, University of Sydney) - Nandan Deshpande (Sydney Informatics Hub, University of Sydney) Acknowledgements (and co-authorship, where appropriate) are an important way for us to demonstrate the value we bring to your research. Your research outcomes are vital for ongoing funding of the Sydney Informatics Hub and national compute facilities. 
We suggest including the following acknowledgement in any publications that follow from this work: The authors acknowledge the technical assistance provided by the Sydney Informatics Hub, a Core Research Facility of the University of Sydney and the Australian BioCommons which is enabled by NCRIS via Bioplatforms Australia. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.339.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "GermlineShortV_biovalidation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-06-17T06:38:32Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:42:33Z"^^ns1:Date ; ns1:description """ Author: AMBARISH KUMAR er.ambarish@gmail.com & ambari73_sit@jnu.ac.in This is a proposed standard operating procedure for genomic variant detection using GATK4. It is hoped to be effective and useful for getting SARS-CoV-2 genome variants. It uses Illumina RNASEQ reads and genome sequence. """ ; ns1:image ; ns1:input , , , , ; ns1:keywords "CWL, GATK4, SNPs, INDELs, SPARK" ; ns1:license ; ns1:name "Genomic variants - SNPs and INDELs detection using GATK4 spark based tools." ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_left_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaseq_right_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sars_cov_2_reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snp" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-05-10T23:42:38Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:52Z"^^ns1:Date ; ns1:description """# HiFi *de novo* genome assembly workflow HiFi-assembly-workflow is a bioinformatics pipeline that can be used to analyse Pacbio CCS reads for *de novo* genome assembly using PacBio Circular Consensus Sequencing (CCS) reads. This workflow is implemented in Nextflow and has 3 major sections. Please refer to the following documentation for detailed description of each workflow section: - [Pre-assembly quality control (QC)](https://github.com/AusARG/hifi-assembly-workflow/blob/master/recommendations.md#stage-1-pre-assembly-quality-control) - [Assembly](https://github.com/AusARG/hifi-assembly-workflow/blob/master/recommendations.md#stage-2-assembly) - [Post-assembly QC](https://github.com/AusARG/hifi-assembly-workflow/blob/master/recommendations.md#stage-3-post-assembly-quality-control) ## HiFi assembly workflow flowchart ![](https://github.com/AusARG/hifi-assembly-workflow/blob/master/workflow.png?raw=true) # Quick Usage: The pipeline has been tested on NCI Gadi and AGRF balder cluster. If needed to run on AGRF cluster, please contact us at bioinformatics@agrf.org.au. Please note for running this on NCI Gadi you need access. Please refer to Gadi guidelines for account creation and usage: these can be found at https://opus.nci.org.au/display/Help/Access. Here is an example that can be used to run a phased assembly on Gadi: ``` Module load nextflow/21.04.3 nextflow run Hifi_assembly.nf –bam_folder -profile gadi The workflow accepts 2 mandatory arguments: --bam_folder -- Full Path to the CCS bam files -profile -- gadi/balder/local ``` Please note that you can either run jobs interactively or submit jobs to the cluster. This is determined by the -profile flag. 
By passing the gadi tag to the profile argument, the jobs are submitted and run on the cluster. # General recommendations for using the HiFi *de novo* genome assembly workflow ## Example local profile usage ``` Start a screen, submit a job, and run the workflow Screen -S ‘name’ qsub -I -qnormal -Pwz54 -lwalltime=48:00:00,ncpus=4,mem=200GB,storage=scratch/wz54+gdata/wz54,wd export MODULEPATH=/apps/Modules/modulefiles:/g/data/wz54/groupResources/modules module load nextflow/21.04.3 nextflow run /g/data/wz54/groupResources/scripts/pl/hifi_assembly.nf --bam_folder -profile local #This load the scripts directory to the environmental PATH and load nextflow module module load hifi_assembly/1.0.0 ``` # Outputs Pipeline generates various files and folders here is a brief description: The pipeline creates a folder called `secondary_analysis` that contains two sub folders named: - `exeReport` - `Results` -- Contains preQC, assembly and postQC analysis files ## exeReport This folder contains a computation resource usage summary in various charts and a text file. `report.html` provides a comprehensive summary. ## Results The `Results` folder contains three sub-directories preQC, assembly and postqc. As the name suggests, outputs from the respective workflow sections are placed in each of these folders. ### preQC The following table contains list of files and folder from preQC results | Output folder/file | File | Description | | ------------------ | ---------------- | ------------------------------------------------------------------------------ | | .fa | | Bam files converted to fasta format | | kmer\\_analysis | | Folder containing kmer analysis outputs | | | .jf | k-mer counts from each sample | | | .histo | histogram of k-mer occurrence | | genome\\_profiling | | genomescope profiling outputs | | | summary.txt | Summary metrics of genome scope outputs | | | linear\\_plot.png | Plot showing no. of times a k-mer observed by no. 
of k-mers with that coverage | ### Assembly This folder contains final assembly results in format. - `_primary.fa` - Fasta file containing primary contigs - `_associate.fa` - Fasta file containing associated contigs ### postqc The postqc folder contains two sub folders - `assembly_completeness` - `assembly_evaluation` #### assembly_completeness This contains BUSCO evaluation results for primary and associate contig. #### assembly_evaluation Assembly evaluation folder contains various file formats, here is a brief description for each of the outputs. | File | Description | | ----------- | ----------------------------------------------------------------------------------------- | | report.txt | Assessment summary in plain text format | | report.tsv | Tab-separated version of the summary, suitable for spreadsheets (Google Docs, Excel, etc) | | report.tex | LaTeX version of the summary | | icarus.html | Icarus main menu with links to interactive viewers | | report.html | HTML version of the report with interactive plots inside | # Infrastructure usage and recommendations ### NCI facility access One should have a user account set with NCI to access gadi high performance computational facility. 
Setting up a NCI account is mentioned in detail at the following URL: https://opus.nci.org.au/display/Help/Setting+up+your+NCI+Account Documentation for a specific infrastructure should go into a infrastructure documentation template https://github.com/AustralianBioCommons/doc_guidelines/blob/master/infrastructure_optimisation.md ## Compute resource usage across tested infrastructures | | Computational resource for plant case study | | ------------------------------------- | ------------------------------------------- | | | Time | CPU | Memory | I/O | | Process | duration | realtime | %cpu | peak\\_rss | peak\\_vmem | rchar | wchar | | Converting bam to fasta for sample | 12m 54s | 12m 48s | 99.80% | 5.2 MB | 197.7 MB | 43.3 GB | 50.1 GB | | Generating k-mer counts and histogram | 26m 43s | 26m 36s | 1725.30% | 19.5 GB | 21 GB | 77.2 GB | 27.1 GB | | Profiling genome characteristics | 34.7s | 13.2s | 89.00% | 135 MB | 601.2 MB | 8.5 MB | 845.9 KB | | Denovo assembly | 6h 51m 15s | 6h 51m 11s | 4744.40% | 84.7 GB | 225.6 GB | 1.4 TB | 456 GB | | evaluate\\_assemblies | 5m 18s | 4m 54s | 98.20% | 1.6 GB | 1.9 GB | 13.6 GB | 2.8 GB | | assemblies\\_completeness | 25m 57s | 25m 53s | 2624.20% | 22 GB | 25.2 GB | 624.9 GB | 2.9 GB | | | Computational resource for bird case study | | ------------------------------------- | ------------------------------------------ | | | Time | CPU | Memory | I/O | | Process | duration | realtime | %cpu | peak\\_rss | peak\\_vmem | rchar | wchar | | Converting bam to fasta for sample | 12m 54s | 7m 9s | 86.40% | 5.2 MB | 197.8 MB | 21.5 GB | 27.4 GB | | Generating k-mer counts and histogram | 26m 43s | 15m 34s | 1687.70% | 10.1 GB | 11.7 GB | 44 GB | 16.6 GB | | Profiling genome characteristics | 34.7s | 1m 15s | 15.30% | 181.7 MB | 562.2 MB | 8.5 MB | 819.1 KB | | De novo assembly | 6h 51m 15s | 9h 2m 47s | 1853.50% | 67.3 GB | 98.4 GB | 1 TB | 395.6 GB | | evaluate assemblies | 5m 18s | 2m 48s | 97.50% | 1.1 GB | 1.4 GB | 8.7 GB | 1.8 
GB | | assemblies completeness | 25m 57s | 22m 36s | 2144.00% | 22.2 GB | 25 GB | 389.7 GB | 1.4 GB | # Workflow summaries ## Metadata | Metadata field | Pre-assembly quality control | Primary assembly | Post-assembly quality control | | ---------------- | --------------------------------------------------------------------------------- | ------------------ | ----------------------------- | | Version | 1.0 | 1.0 | 1.0 | | Maturity | Production | Production | production | | Creators | Naga, Kenneth | Naga, Kenneth | Naga, Kenneth | | Source | [AusARG/hifi-assembly-workflow](https://github.com/AusARG/hifi-assembly-workflow) | | License | MIT License | MIT License | MIT License | | Workflow manager | NextFlow | NextFlow | NextFlow | | Container | No containers used | No containers used | No containers used | | Install method | Manual | Manual | Manual | ## Component tools ​ | Workflow element | Workflow element version | Workflow title | | --------------------------------- | ------------------------ | ----------------------------- | | Samtools, jellyfish, genomescope | 1.0 | Pre-assembly quality control | | Improved phased assembler (pbipa) | 1.0 | Primary assembly | | Quast and busco | 1.0 | Post-assembly quality control | ## Required (minimum) inputs/parameters PATH to HIFI bam folder is the minimum requirement for the processing the pipeline. ## Third party tools / dependencies The following packages are used by the pipeline. - `nextflow/21.04.3` - `samtools/1.12` - `jellyfish/2.3.0` - `genomescope/2.0` - `ipa/1.3.1` - `quast/5.0.2` - `busco/5.2.2` The following paths contain all modules required for the pipeline. - `/apps/Modules/modulefiles` - `/g/data/wz54/groupResources/modules` --- # Help/FAQ/Troubleshooting Direct training and help is available if you are new to HPC and/or new to NCI/Gadi. - Basic information to get started with the NCI Gadi for bioinformatics can be found at https://github.com/AusARG/ABLeS/wiki/temppage. 
- For NCI support, contact the NCI helpdesk directly at https://www.nci.org.au/users/nci-helpdesk - Queue limits and structure explained at https://opus.nci.org.au/display/Help/4.+PBS+Jobs --- # 3rd party Tutorials A tutorial by Andrew Severin on running GenomeScope 1.0 is available here: https://github.com/AusARG/hifi-assembly-workflow.git Improved Phased Assembler tutorial is available at https://github.com/PacificBiosciences/pbbioconda/wiki/Improved-Phased-Assembler Busco tutorial https://wurmlab.com/genomicscourse/2016-SIB/practicals/busco/busco_tutorial --- # Licence(s) MIT License Copyright (c) 2022 AusARG Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- # Acknowledgements/citations/credits > Jung, H. et al. Twelve quick steps for genome assembly and annotation in the classroom. PLoS Comput. Biol. 16, 1–25 (2020). > 2020, G. A. W. No Title. https://ucdavis-bioinformatics-training.github.io/2020-Genome_Assembly_Workshop/kmers/kmers. > Sović, I. et al. Improved Phased Assembly using HiFi Data. (2020). 
> Gurevich, A., Saveliev, V., Vyahhi, N. & Tesler, G. QUAST: Quality assessment tool for genome assemblies. Bioinformatics 29, 1072–1075 (2013). > Waterhouse, R. M. et al. BUSCO applications from quality assessments to gene prediction and phylogenomics. Mol. Biol. Evol. 35, 543–548 (2018). --- """ ; ns1:keywords "" ; ns1:license ; ns1:name "HiFi de novo genome assembly workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "KNIME" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-20T09:38:29Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:52Z"^^ns1:Date ; ns1:description "A workflow for the quality assessment of mass spectrometry (MS) based proteomics analyses" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.343.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "MaCProQC" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference filters files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output Destination" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filter rRNA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto index file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum memory in MB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto output" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-05-20T10:10:26Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:59:53Z"^^ns1:Date ; ns1:description """### Workflow Kallisto RNAseq **(pseudoalignment on transcripts)** - Workflow Illumina Quality: https://workflowhub.eu/workflows/336?version=1 - kallisto **All tool CWL files and other workflows can be found here:**
Tools: https://git.wur.nl/unlock/cwl/-/tree/master/cwl
Workflows: https://git.wur.nl/unlock/cwl/-/tree/master/cwl/workflows **How to set up and use an UNLOCK workflow:**
https://m-unlock.gitlab.io/docs/setup/setup.html """ ; ns1:image ; ns1:input , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Kallisto RNAseq Workflow" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Demultiplexed reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cutadapt on input dataset(s): Read 1 Output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC on input dataset(s): RawData" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FastQC on input dataset(s): Webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC on input dataset(s): Stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC on input dataset(s): Webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp on input dataset(s): HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp on input dataset(s): Read 1 output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T06:55:21Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-qc-of-radseq-reads These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. 
Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ ## Inputs * demultiplexed reads in fastq format, in a collection * two adapter sequences in fasta format, for input into cutadapt ## Steps and outputs The workflow can be modified to suit your own parameters. The workflow steps are: * Run FastQC to get statistics on the raw reads, send to MultiQC to create a nice output. This is tagged as "Report 1" in the Galaxy history. * Run Cutadapt on the reads to cut adapters - enter two files with adapter sequence at the workflow option for "Choose file containing 3' adapters". The default settings are on except that the "Maximum error rate" for the adapters is set to 0.2 instead of 0.1. Send output statistics to MulitQC, this is "Report 2" in the Galaxy history. Note that you may have different requirements here in terms of how many adapter sequences you want to enter. We recommend copying the workflow and modifying as needed. * Send these reads to fastp for additional filtering or trimming. Default settings are on but can be modified as needed. Send output statistics to MultiQC, this is "Report 3" in the Galaxy history. * The filtered and trimmed reads are then ready for the stacks workflows. ![qc-wf](wf-image-qc.png) """ ; ns1:image ; ns1:input ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "QC of RADseq reads" ; ns1:output , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T07:20:38Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-ref-guided-stacks These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. 
Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ ## Inputs * demultiplexed reads in fastq format, may be output from the QC workflow. Files are in a collection. * population map in text format * reference genome in fasta format ## Steps and outputs BWA MEM 2: * The reads are mapped to the reference genome; output in BAM format * The collection of bam files is named something like Map with BWA-MEM on collection 5 (mapped reads in BAM format) * Each of the bam files in the collection is named something like sample_CAAC Samtools stats before filtering: * These bam files are sent to Samtools stats to get statistics; these are then sent to MultiQC to provide a nice output. This is tagged as "bam stats before filtering" in the Galaxy history. * The "General Statistics" show how many reads were mapped - if there is a low mapping rate, it may be worth re-checking or repeating QC on the raw reads, or considering a different reference genome, or using a de novo approach. To see if many reads have been soft-clipped by Bwa mem (which may affect how well gstacks can work), look at the "Alignment Metrics" section, and the row with "Mapped bases (Cigar)". Hover over the dots to see sample names especially towards the left of the row - these have the least mapped reads. Samtools view: * This step filters out certain reads from the bam files. The default settings are to exclude reads if they are unmapped, if the alignment is not primary or is supplementary, if the read fails platform/vendor quality checks, and if the read is a PCR or optical duplicate. * The output bams are tagged with "filtered bams" in the Galaxy history. Samtools stats after filtering: * Filtered bams are sent again to samtools stats, and statistics to MultiQC, with the report tagged as "bam stats after filtering" in the Galaxy history. gstacks: * Filtered bams and a population map are sent to gstacks. 
The outputs are: * Catalog of loci in fasta format * Variant calls in VCF format * Note: some bam files cause errors here with gstacks. For example, the log file may say "Error, all records discard with file SampleXYZ.FASTQ.bam, Aborted". If this occurs, check the bam stats (as described above). Some of the options are to re-do QC on the raw reads, change settings for mapping reads in BWA MEM, and/or delete this sample/s from the population map and proceed to gstacks. The sample can still remain in the list of bam files but gstacks will only consider what is listed in the pop map. populations: * gstacks outputs and a population map are snet to the "populations" module. The outputs are: * Locus consensus sequences in fasta format * Snp calls, in VCF format * Haplotypes, in VCF format * Summary statistics ![qc-wf](wf-ref-guided.png) """ ; ns1:image ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Stacks RAD-seq reference-guided workflow" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T07:39:10Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-denovo-stacks These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ ## Inputs * demultiplexed reads in fastq format, may be output from the QC workflow. Files are in a collection. * population map in text format ## Steps and outputs ustacks: * input reads go to ustacks. * ustacks assembles the reads into matching stacks (hypothetical alleles). * The outputs are in a collection called something like: Stacks2: ustacks on data 21, data 20, and others Loci and polymorphism. 
Click on this to see the files: * for each sample, assembled loci (tsv format), named e.g. sample_CAAC.tags * for each sample, model calls from each locus (tsv format), named e.g. sample_CAAC.snps * for each sample, haplotypes/alleles recorded from each locus (tsv format), named e.g. sample_CAAC.alleles * Please see sections 6.1 to 6.4 in https://catchenlab.life.illinois.edu/stacks/manual/#ufiles for a full description. cstacks: * cstacks will merge stacks into a catalog of consensus loci. * The outputs are in a collection called something like Stacks2: cstacks on data 3, data 71, and others Catalog of loci. Click on this to see the three files, each in tsv format: catalog.tags catalog.snps catalog.alleles sstacks: * sstacks will compare each sample to the loci in the catalog. * The outputs are in a collection called something like Stacks2: sstacks on data 3, data 76, and others Matches to the catalog.Click on this to see the files: There is one file for each sample, named e.g. sample_CAAC.matches, in tsv format. tsv2bam: * Conversion to BAM format * Reads from each sample are now aligned to each locus, and the tsv2bam tool will convert this into a bam file for each sample. * The outputs are in a collection called something like Stacks2: tsv2bam on data 3, data 94, and others Matches to the catalog.Click on this to see the files: There is one file for each sample, named e.g sample_CAAC.matches, in BAM format. gstacks: * Catalog of loci in fasta format * Variant calls in VCF format populations: * Locus consensus sequences in fasta format * Snp calls, in VCF format * Haplotypes, in VCF format * Summary statistics ![denovo](wf-denovo.png) """ ; ns1:image ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Stacks RAD-seq de novo workflow" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T07:50:13Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-partial-ustacks-only These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ For the full de novo workflow see https://workflowhub.eu/workflows/348 You may want to run ustacks with different batches of samples. * To be able to combine these later, there are some necessary steps - we need to keep track of how many samples have already run in ustacks, so that new samples can be labelled with different identifying numbers. * In ustacks, under "Processing options" there is an option called "Start identifier at". * The default for this is 1, which can be used for the first batch of samples. These will then be labelled as sample 1, sample 2 and so on. * For any new batches of samples to process in ustacks, we will want to start numbering these at the next available number. e.g. if there were 10 samples in batch 1, this should then be set to start at 11. To combine multiple outputs from ustacks, providing these have been given appropriate starting identifiers: * Find the ustacks output in the Galaxy history. This will be a list of samples. * Click on the cross button next to the filename to delete, but select "Collection only". This releases the items from the list, but they will now be hidden in the Galaxy history. * In the history panel, click on "hidden" to reveal any hidden files. Unhide the samples. * Do this for all the batches of ustacks outputs that are needed. * Click on the tick button, tick all the samples needed, then "For all selected" choose "Build dataset list" * This is now a combined set of samples for input into cstacks. 
""" ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Partial de novo workflow: ustacks only" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-17T07:41:06Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:42:36Z"^^ns1:Date ; ns1:description """ Author: AMBARISH KUMAR er.ambarish@gmail.com; ambari73_sit@jnu.ac.in This is a proposed standard operating procedure for genomic variant detection using SAMTools. It is hoped to be effective and useful for getting SARS-CoV-2 genome variants. It uses Illumina RNASEQ reads and genome sequence. """ ; ns1:image ; ns1:input , , , ; ns1:keywords "CWL, SAMTools, SNPs, INDELs, covid-19" ; ns1:license ; ns1:name "Genomic variants - SNPs and INDELs detection using SAMTools." ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Refrence genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VCF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Population map" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ustacks inputs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: cstacks on input dataset(s) Catalog of loci" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: cstacks on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: gstacks on input dataset(s) Assembled contigs and variant sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: gstacks on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Population-level haplotype summary statistics" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Population-level summary statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Populations log distributions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Raw Genotypes/Haplotypes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Summary of Population-level summary statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: sstacks on input dataset(s) Matches to the catalog" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: sstacks on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: tsv2bam on input dataset(s) Matches to the catalog (bam)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: tsv2bam on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T07:56:39Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-partial-cstacks-sstacks-gstacks These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ This workflow takes in ustacks output, and runs cstacks, sstacks and gstacks. To generate ustacks output see https://workflowhub.eu/workflows/349 For the full de novo workflow see https://workflowhub.eu/workflows/348 """ ; ns1:input , ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Partial de novo workflow: c-s-g-pops only" ; ns1:output , , , , , , , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T08:05:01Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-partial-bwa-mem These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ This workflow is part of the reference-guided stacks workflow, https://workflowhub.eu/workflows/347 Inputs * demultiplexed reads in fastq format, may be output from the QC workflow. Files are in a collection. * reference genome in fasta format Outputs * A set of filtered bam files, ready for the next part of the stacks workflow (e.g. gstacks). * Statistics on the bam files. 
""" ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Partial ref-guided workflow - bwa mem only" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bam files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Population map" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: gstacks on input dataset(s) Assembled contigs and variant sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: gstacks on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Population-level haplotype summary statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Population-level summary statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Populations log distributions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Raw Genotypes/Haplotypes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) Summary of Population-level summary statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stacks2: populations on input dataset(s) log file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-05-31T08:12:30Z"^^ns1:Date ; ns1:dateModified "2023-01-30T18:21:31Z"^^ns1:Date ; ns1:description """# workflow-partial-gstacks-populations These workflows are part of a set designed to work for RAD-seq data on the Galaxy platform, using the tools from the Stacks program. Galaxy Australia: https://usegalaxy.org.au/ Stacks: http://catchenlab.life.illinois.edu/stacks/ This workflow is part of the reference-guided stacks workflow, https://workflowhub.eu/workflows/347 This workflow takes in bam files and a population map. To generate bam files see: https://workflowhub.eu/workflows/351 """ ; ns1:input , ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Partial ref-guided workflow - gstacks and pops" ; ns1:output , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenBank genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Fasta sequences for " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Map with BWA-MEM on input dataset(s) (mapped reads in BAM format)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SnpEff eff: on input dataset(s) - stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SnpEff4.3 database for " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "called_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pe" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_mapped_reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapped_reads_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "markduplicates_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "markduplicates_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "realigned_deduplicated_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "realigned_deduplicated_filtered_mapped_reads_with_indel_quals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "soft_filtered_variants" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-01T12:46:37Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:44Z"^^ns1:Date ; ns1:description """# Generic variant calling A generic workflow for identification of variants in a haploid genome such as genomes of bacteria or viruses. It can be readily used on MonkeyPox. The workflow accepts two inputs: - A genbank file with the reference genomes - A collection of paired fastqsanger files The workflow outputs a collection of VCF files for each sample (each fastq pair). These VCF files serve as input to the [Reporting workflow](https://workflowhub.eu/workflows/354). Workflow can be accessed directly on [usegalaxy.org](https://usegalaxy.org/u/aun1/w/generic-variation-analysis-on-wgs-pe-data) The general idea of the workflow is: ![](https://i.imgur.com/rk40Y4t.png)""" ; ns1:input , ; ns1:keywords "mpxv, generic" ; ns1:license ; ns1:name "Generic variation analysis on WGS PE data" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "AF Filter" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DP Filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DP_ALT Filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variation data to report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "af_recalculated" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "all_variants_all_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "by_variant_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cleaned_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "collapsed_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_variant_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_extracted_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "highest_impact_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "prefiltered_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "processed_variants_collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "variants_for_plotting" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-01T15:36:06Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:00:24Z"^^ns1:Date ; ns1:description """# Generic variation analysis reporting This workflow generates reports from a list of variants generated by [Variant Calling Workflow](https://workflowhub.eu/workflows/353). The workflow accepts a single input: - A collection of VCF files The workflow produces two outputs (format description below): 1. A list of variants grouped by Sample 2. A list of variants grouped by Variant Here is example of output **by sample**. 
In this table all variants in all samples are explicitly listed: | Sample | POS | FILTER | REF | ALT | DP | AF | AFcaller | SB | DP4 | IMPACT | FUNCLASS | EFFECT | GENE | CODON | AA | TRID | min(AF) | max(AF) | countunique(change) | countunique(FUNCLASS) | change | |----------|------|----------|---------|-----|-----|------|-----------|-----|-------|----------|---------------|-------------|--------|-------------| ---|--------|----------|-----------|-------------------------|------------------------------|------------| | ERR3485786 | 11644 | PASS | A | G | 97 | 0.979381 | 0.907216 | 0 | 1,1,49,46 | LOW | SILENT | SYNONYMOUS_CODING | D7L | tgT/tgC | C512 | AKG51361.1 | 0.979381 | 1 | 1 | 1 | A>G | | ERR3485786 | 11904 | PASS | T | C | 102 | 0.990196 | 0.95098 | 0 | 0,0,51,50 | MODERATE | MISSENSE | NON_SYNONYMOUS_CODING | D7L | Act/Gct | T426A | AKG51361.1 | 0.990196 | 1 | 1 | 1 | T>C | > **Note** the two alternative allele frequency fields: "AFcaller" and "AF". LoFreq reports AF values listed in "AFcaller". They are incorrect due to the known LoFreq [bug](https://github.com/CSB5/lofreq/issues/80). To correct for this we are recomputing AF values from DP4 and DP fields as follows: `AF == (DP4[2] + DP4[3]) / DP.` Here is an example of output **by variant**. In this table data is aggregated by variant across all samples in which this variant is present: | POS | REF | ALT | IMPACT | FUNCLASS | EFFECT | GENE | CODON | AA | TRID | countunique(Sample) | min(AF) | max(AF) | SAMPLES(above-thresholds) | SAMPLES(all) | AFs(all) | change | |-----|-------|-----|-----------|----------------|------------|----------|-----------|------|--------|------------------------|----------|-----------|------------------------------------|------------------|----------|---------| | 11644 | A | G | LOW | SILENT | SYNONYMOUS_CODING | D7L | tgT/tgC | C512 | AKG51361.1 | 11 | 0.979381 | 1 | ERR3485786,ERR3485787... | ERR3485786,ERR3485787,ERR3485789 ... | 0.979381,1.0... 
| A>G | | 11904 | T | C | MODERATE | MISSENSE | NON_SYNONYMOUS_CODING | D7L | Act/Gct | T426A | AKG51361.1 | 12 | 0.990196 | 1 | ERR3485786,ERR3485787... | ERR3485786,ERR3485787,ERR3485789... | 0.990196,1.0,1.0... | T>C | The workflow can be accessed at [usegalaxy.org](https://usegalaxy.org/u/aun1/w/genetic-variation-analysis-reporting) The general idea of the workflow is: ![](https://i.imgur.com/k2cIZK5.png) """ ; ns1:input , , , ; ns1:keywords "mpvx, generic" ; ns1:license ; ns1:name "Generic variation analysis reporting" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Depth-threshold for masking" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variant calls" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "aligned reads data for depth calculation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min-AF for consensus variant" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "min-AF for failed variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_based_masking_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "called_variant_sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_called_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_called_variants_with_0_based_start" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_called_variants_with_0_based_start_end" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_failed_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_failed_variants_with_0_based_start" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chrom_pos_ref_failed_variants_with_0_based_start_end" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "consensus_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "coverage_depth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "failed_variant_sites" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filter_failed_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "low_cov_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "low_cov_regions_plus_filter_failed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "low_cov_regions_plus_filter_failed_combined" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "masking_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "masking_regions_with_1_based_start" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multisample_consensus_fasta" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-03T09:33:10Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:44Z"^^ns1:Date ; ns1:description """# Generic consensus building This workflow generates consensus sequences using a list of variants generated by [Variant Calling Workflow](https://workflowhub.eu/workflows/353). The workflow accepts a single input: - A collection of VCF files The workflow produces a single output: - Consensus sequence for each input VCF file The workflow can be accessed at [usegalaxy.org](https://usegalaxy.org/u/aun1/w/consensus-construction)""" ; ns1:input , , , , , ; ns1:keywords "mlxv, generic" ; ns1:license ; ns1:name "Generic consensus construction from VCF calls" ; ns1:output , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Anton Nekrutenko" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenBank genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Name for genome database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Paired Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Fasta sequences for genbank file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SnpEff eff: stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SnpEff variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SnpEff4.3 database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bwa_mem_alignments" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/called_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastp_pe" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered_alignment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapped_reads_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/markduplicates_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/preprocessing_and_mapping_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_filtered_mapped_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/realigned_deduplicated_filtered_mapped_reads_with_indel_quals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/soft_filtered_variants" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-04T02:01:11Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:33Z"^^ns1:Date ; ns1:description """Generic variation analysis on WGS PE data ------------------------------------------- This workflows performs paired end read mapping with bwa-mem followed by sensitive variant calling across a wide range of AFs with lofreq and variant annotation with snpEff. The reference genome can be provided as a GenBank file. """ ; ns1:input , , ; ns1:keywords "mpxv, generic" ; ns1:license ; ns1:name "generic-variant-calling-wgs-pe/main" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Miguel Roncoroni" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input genomes as collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ClustalW on input dataset(s): clustal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Proteinortho on input dataset(s): orthology-groups" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Proteinortho_extract_by_orthogroup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "extracted_ORFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fasta_header_cleaned" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "funannotate_predicted_proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "headers_shortened" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "proteomes_to_one_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "repeat_masked" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sample_names_to_headers" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-06T14:05:14Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:44Z"^^ns1:Date ; ns1:description "This workflow begins from a set of genome assemblies of different samples, strains, species. The genome is first annotated with Funannotate. Predicted proteins are further annotated with Busco. Next, 'ProteinOrtho' finds orthologs across the samples and makes orthogroups. Orthogroups where all samples are represented are extracted. Orthologs in each orthogroup are aligned with ClustalW. Test dataset: https://zenodo.org/record/6610704#.Ypn3FzlBw5k" ; ns1:input ; ns1:keywords "phylogenetics, phylogenomics, Annotation" ; ns1:license ; ns1:name "preparing genomic data for phylogeny recostruction (GTN)" ; ns1:output , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Miguel Roncoroni" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input alignment collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "A partition file ready for input into RAxML or IQ-tree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "An occupancy file that summarizes the taxon occupancy per sequence" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Astral log." . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Concatenated fasta alignment file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): BIONJ Tree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): Consensus Tree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): MaxLikelihood Distance Matrix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): MaxLikelihood Tree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): Occurence Frequencies in Bootstrap Trees" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "IQ-TREE on input dataset(s): Report and Final Tree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output tree file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trimmed alignment." . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-06-06T14:14:25Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:44Z"^^ns1:Date ; ns1:description """Phylogenetic reconstruction using genome-wide and single-gene alignment data. Here we use maximum likelihood reconstruction program IQTree. Data can be prepared using the [phylogenetic data preparation workflow](http://workflowhub.eu/workflows/358) prior to phylogenetic reconstruction. 
Resulting trees can be viewed interactively using Galaxy's 'Phyloviz' or 'Phylogenetic Tree Visualization'""" ; ns1:input ; ns1:keywords "phylogenetics, phylogenomics" ; ns1:license ; ns1:name "ML phylogenetic reconstruction" ; ns1:output , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T15:49:59Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Detects SNPs and INDELs." ; ns1:image ; ns1:input , , ; ns1:keywords "Galaxy, SNPs, INDELs, GATK4" ; ns1:license ; ns1:name "COVID-19: GATK4" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name "Alex L Mitchell" . a ns1:Person ; ns1:name "Alex Shlemov" . a ns1:Person ; ns1:name "Alexandre Almeida" . a ns1:Person ; ns1:name "Alla Lapidus" . a ns1:Person ; ns1:name "Anton Korobeynikov" . a ns1:Person ; ns1:name "Ekaterina Sakharova" . a ns1:Person ; ns1:name "Guy Cochrane" . a ns1:Person ; ns1:name "Josephine Burgin" . a ns1:Person ; ns1:name "Lorna J Richardson" . a ns1:Person ; ns1:name "Martin Beracochea" . a ns1:Person ; ns1:name "Maxim Scheremetjew" . a ns1:Person ; ns1:name "Michael R Crusoe" . a ns1:Person ; ns1:name "Miguel Boland" . a ns1:Person ; ns1:name "Olga Kunyavskaya" . a ns1:Person ; ns1:name "Robert D Finn" . a ns1:Person ; ns1:name "Simon C Potter" . 
a ns1:Person ; ns1:name "Varsha Kale" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5.8s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CGC_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CGC_postfixes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_data_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_diamond_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_gathering_bit_score" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_name_database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_omit_alignment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_applications" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_databases" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_outputFormat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Uniref90_db_txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cgc_chunk_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clusters_glossary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "contig_min_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "diamond_databaseFile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "diamond_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "diamond_maxTargetSeqs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "func_ann_names_hmmer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "func_ann_names_ips" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "go_config" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gp_flatfiles_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "graphs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hmmsearch_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ips_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ko_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "other_ncrna_models" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pathways_classes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pathways_names" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "protein_chunk_size_IPS" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "protein_chunk_size_eggnog" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "protein_chunk_size_hmm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_model_clans" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_models" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bgzip_fasta_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bgzip_index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chunking_nucleotides" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chunking_proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "completed_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "compressed_files" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "functional_annotation_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hashsum_input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "index_fasta_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_cds_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_tax_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pathways_systems_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pathways_systems_folder_antismash" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pathways_systems_folder_antismash_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-statistics_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-status" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rna-count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sequence-categorisation_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "taxonomy-summary_folder" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , ; ns1:dateCreated "2022-06-07T08:03:35Z"^^ns1:Date ; ns1:dateModified "2023-04-28T10:09:02Z"^^ns1:Date ; ns1:description """MGnify (http://www.ebi.ac.uk/metagenomics) provides a free to use platform for the assembly, analysis and archiving of microbiome data derived from sequencing microbial populations that are present in particular environments. Over the past 2 years, MGnify (formerly EBI Metagenomics) has more than doubled the number of publicly available analysed datasets held within the resource. 
Recently, an updated approach to data analysis has been unveiled (version 5.0), replacing the previous single pipeline with multiple analysis pipelines that are tailored according to the input data, and that are formally described using the Common Workflow Language, enabling greater provenance, reusability, and reproducibility. MGnify's new analysis pipelines offer additional approaches for taxonomic assertions based on ribosomal internal transcribed spacer regions (ITS1/2) and expanded protein functional annotations. Biochemical pathways and systems predictions have also been added for assembled contigs. MGnify's growing focus on the assembly of metagenomic data has also seen the number of datasets it has assembled and analysed increase six-fold. The non-redundant protein database constructed from the proteins encoded by these assemblies now exceeds 1 billion sequences. Meanwhile, a newly developed contig viewer provides fine-grained visualisation of the assembled contigs and their enriched annotations. Documentation: https://docs.mgnify.org/en/latest/analysis.html#assembly-analysis-pipeline """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Metagenomics, Annotation, workflow, CWL" ; ns1:license ; ns1:name "MGnify - assembly analysis pipeline" ; ns1:output , , , , , , , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Alex L Mitchell" . a ns1:Person ; ns1:name "Alex Shlemov" . a ns1:Person ; ns1:name "Alexandre Almeida" . a ns1:Person ; ns1:name "Alla Lapidus" . a ns1:Person ; ns1:name "Anton Korobeynikov" . a ns1:Person ; ns1:name "Ekaterina Sakharova" . a ns1:Person ; ns1:name "Guy Cochrane" . a ns1:Person ; ns1:name "Josephine Burgin" . a ns1:Person ; ns1:name "Lorna J Richardson" . a ns1:Person ; ns1:name "Martin Beracochea" . a ns1:Person ; ns1:name "Maxim Scheremetjew" . 
a ns1:Person ; ns1:name "Michael R Crusoe" . a ns1:Person ; ns1:name "Miguel Boland" . a ns1:Person ; ns1:name "Olga Kunyavskaya" . a ns1:Person ; ns1:name "Robert D Finn" . a ns1:Person ; ns1:name "Simon C Potter" . a ns1:Person ; ns1:name "Varsha Kale" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5.8s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "itsonedb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "itsonedb_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "itsonedb_otu_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "itsonedb_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc_min_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_model_clans" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_models" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "single_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stats_file_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "unite_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "unite_label" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "unite_otu_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "unite_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ITS-length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "completed_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_filtering_json_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gz_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hashsum_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hashsum_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_tax_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-status" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rna-count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sequence-categorisation_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "suppressed_upload" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "taxonomy-summary_folder" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , ; ns1:dateCreated "2022-06-07T08:28:11Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:01:07Z"^^ns1:Date ; ns1:description """MGnify (http://www.ebi.ac.uk/metagenomics) provides a free to use platform for the assembly, analysis and archiving of microbiome data derived from sequencing microbial populations that are present in particular environments. Over the past 2 years, MGnify (formerly EBI Metagenomics) has more than doubled the number of publicly available analysed datasets held within the resource. 
Recently, an updated approach to data analysis has been unveiled (version 5.0), replacing the previous single pipeline with multiple analysis pipelines that are tailored according to the input data, and that are formally described using the Common Workflow Language, enabling greater provenance, reusability, and reproducibility. MGnify's new analysis pipelines offer additional approaches for taxonomic assertions based on ribosomal internal transcribed spacer regions (ITS1/2) and expanded protein functional annotations. Biochemical pathways and systems predictions have also been added for assembled contigs. MGnify's growing focus on the assembly of metagenomic data has also seen the number of datasets it has assembled and analysed increase six-fold. The non-redundant protein database constructed from the proteins encoded by these assemblies now exceeds 1 billion sequences. Meanwhile, a newly developed contig viewer provides fine-grained visualisation of the assembled contigs and their enriched annotations. Documentation: https://docs.mgnify.org/en/latest/analysis.html#amplicon-analysis-pipeline """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "CWL, Metagenomics, rna, workflow" ; ns1:license ; ns1:name "MGnify - amplicon analysis pipeline" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Alex L Mitchell" . a ns1:Person ; ns1:name "Alex Shlemov" . a ns1:Person ; ns1:name "Alexandre Almeida" . a ns1:Person ; ns1:name "Alla Lapidus" . a ns1:Person ; ns1:name "Anton Korobeynikov" . a ns1:Person ; ns1:name "Ekaterina Sakharova" . a ns1:Person ; ns1:name "Guy Cochrane" . a ns1:Person ; ns1:name "Josephine Burgin" . a ns1:Person ; ns1:name "Lorna J Richardson" . a ns1:Person ; ns1:name "Martin Beracochea" . a ns1:Person ; ns1:name "Maxim Scheremetjew" . a ns1:Person ; ns1:name "Michael R Crusoe" . 
a ns1:Person ; ns1:name "Miguel Boland" . a ns1:Person ; ns1:name "Olga Kunyavskaya" . a ns1:Person ; ns1:name "Robert D Finn" . a ns1:Person ; ns1:name "Simon C Potter" . a ns1:Person ; ns1:name "Varsha Kale" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5.8s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "5s_pattern" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CGC_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CGC_postfixes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_data_dir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EggNOG_diamond_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_gathering_bit_score" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_name_database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HMM_omit_alignment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_applications" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_databases" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan_outputFormat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cgc_chunk_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "func_ann_names_hmmer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "func_ann_names_ips" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "go_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hmmsearch_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ips_header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ko_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_db" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lsu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "other_ncRNA_models" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "protein_chunk_size_IPS" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "protein_chunk_size_hmm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc_min_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_model_clans" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rfam_models" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "single_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_label" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_otus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ssu_tax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chunking_nucleotides" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "chunking_proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "completed_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "compressed_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_filtering_json_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "functional_annotation_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hashsum_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hashsum_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "motus_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_cds_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "no_tax_flag_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-statistics" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc-status" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qc_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rna-count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sequence-categorisation_folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "taxonomy-summary_folder" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2022-06-07T08:40:40Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:01:12Z"^^ns1:Date ; ns1:description """MGnify (http://www.ebi.ac.uk/metagenomics) provides a free to use platform for the assembly, analysis and archiving of microbiome data derived from sequencing microbial populations that are present in particular environments. Over the past 2 years, MGnify (formerly EBI Metagenomics) has more than doubled the number of publicly available analysed datasets held within the resource. Recently, an updated approach to data analysis has been unveiled (version 5.0), replacing the previous single pipeline with multiple analysis pipelines that are tailored according to the input data, and that are formally described using the Common Workflow Language, enabling greater provenance, reusability, and reproducibility. MGnify's new analysis pipelines offer additional approaches for taxonomic assertions based on ribosomal internal transcribed spacer regions (ITS1/2) and expanded protein functional annotations. Biochemical pathways and systems predictions have also been added for assembled contigs. MGnify's growing focus on the assembly of metagenomic data has also seen the number of datasets it has assembled and analysed increase six-fold. The non-redundant protein database constructed from the proteins encoded by these assemblies now exceeds 1 billion sequences. 
Meanwhile, a newly developed contig viewer provides fine-grained visualisation of the assembled contigs and their enriched annotations. Documentation: https://docs.mgnify.org/en/latest/analysis.html#raw-reads-analysis-pipeline""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.362.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "Workflows, CWL, Metagenomics" ; ns1:license ; ns1:name "MGnify - raw-reads analysis pipeline" ; ns1:output , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pedigree" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 GEMINI results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 Merged VCF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 Normalized VCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 Normalized VCFs (Removed )" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 SnpEff Annotated vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 VCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 VCFs (Fixed Header and Chr)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 VCFs.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Case 5 gene.iobio results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EGA Download Client: authorized datasets" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEMINI Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Case 5 VCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of files in EGAD00001008392" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SnpEff Annotated vcf_bgzip" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-03-01T15:35:32Z"^^ns1:Date ; ns1:dateModified "2023-09-05T08:11:49Z"^^ns1:Date ; ns1:description """To discover causal mutations of inherited diseases it’s common practice to do a trio analysis. In a trio analysis DNA is sequenced of both the patient and parents. Using this method, it’s possible to identify multiple inheritance patterns. Some examples of these patterns are autosomal recessive, autosomal dominant, and de-novo variants, which are represented in the figure below. To elaborate, the most left tree shows an autosomal dominant inhertitance pattern where the offspring inherits a faulty copy of the gene from one of the parents. To discover these mutations either whole exome sequencing (WES) or whole genome sequencing (WGS) can be used. With these technologies it is possible to uncover the DNA of the parents and offspring to find (shared) mutations in the DNA. These mutations can include insertions/deletions (indels), loss of heterozygosity (LOH), single nucleotide variants (SNVs), copy number variations (CNVs), and fusion genes. In this workflow we will also make use of the HTSGET protocol, which is a program to download our data securely and savely. This protocol has been implemented in the EGA Download Client Tool: toolshed.g2.bx.psu.edu/repos/iuc/ega_download_client/pyega3/4.0.0+galaxy0 tool, so we don’t have to leave Galaxy to retrieve our data. We will not start our analysis from scratch, since the main goal of this tutorial is to use the HTSGET protocol to download variant information from an online archive and to find the causative variant from those variants. 
If you want to learn how to do the analysis from scratch, using the raw reads, you can have a look at the Exome sequencing data analysis for diagnosing a genetic disease tutorial.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.363.2" ; ns1:input ; ns1:isBasedOn ; ns1:keywords "variant-analysis" ; ns1:license ; ns1:name "Trio Analysis" ; ns1:output , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "inputdata_cesm_2_1_3_B1850_f19_g17_tar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "user_nl_cam_rs" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-06-12T19:01:32Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:01:23Z"^^ns1:Date ; ns1:description """This workflow demonstrates the usage of the [Community Earth System Model](https://www.cesm.ucar.edu/) on Galaxy Europe. A fully coupled B1850 compset with resolution f19_g17 is run for 1 month. ![](https://nordicesmhub.github.io/GEO4962/fig/newcase.png)""" ; ns1:image ; ns1:input , ; ns1:keywords "" ; ns1:license ; ns1:name "Workflow for running the Community Earth System Model in fully coupled mode" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of Paired Reads - Maternal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of Paired Reads - Paternal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/K-mer length " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Pacbio Hifi reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Ploidy" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope linear plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope linear plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope linear plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope log plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope log plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope log plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope model (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope summary (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed linear plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed linear plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed linear plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed log plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed log plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed log plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Meryl mat.meryldb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Meryl pat.meryldb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Meryl read-db.meryldb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-06-14T02:01:13Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:33Z"^^ns1:Date ; ns1:description """# VGP Workflow #1 This workflow collects the metrics on the properties of the genome under consideration by analyzing the k-mer frequencies. It provides information about the genomic complexity, such as the genome size and levels of heterozygosity and repeat content, as well about the data quality. It uses reads from two parental genomes to partition long reads from the offspring into haplotype-specific k-mer databases. ### Inputs - Collection of Hifi long reads in FASTQ format - Paternal short-read Illumina sequencing reads in FASTQ format - Maternal short-read Illumina sequencing reads in FASTQ format ### Outputs - Meryl databases of k-mer counts - Child - Paternal haplotype - Maternal haplotype - GenomeScope metrics of child and parental genomes - Linear plot - Log plot - Transformed linear plot - Transformed log plot - Summary - Model - Model parameteres""" ; ns1:input , , , , ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "VGP-meryldb-creation-trio/main" ; ns1:output , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of Pacbio Data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/K-mer length " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Ploidy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope linear plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope log plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope model parameters" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed linear plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GenomeScope transformed log plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merged Meryl Database" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-06-14T02:01:14Z"^^ns1:Date ; ns1:dateModified "2026-01-25T02:01:20Z"^^ns1:Date ; ns1:description """# VGP Workflow #1 This workflow produces a Meryl database and Genomescope outputs that will be used to determine parameters for following workflows, and assess the quality of genome assemblies. Specifically, it provides information about the genomic complexity, such as the genome size and levels of heterozygosity and repeat content, as well about the data quality. ### Inputs - Collection of Hifi long reads in FASTQ format ### Outputs - Meryl Database of kmer counts - GenomeScope - Linear plot - Log plot - Transformed linear plot - Transformed log plot - Summary - Model - Model parameteres""" ; ns1:input , , ; ns1:keywords "" ; ns1:license ; ns1:name "VGP-meryldb-creation/main" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotate bins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotate unbinned" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly choice" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bakta DB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run binning workflow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bracken levels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BUSCO dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Deduplicate illumina reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output Destination" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "eggnog_dbs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Fastq rich (ONT)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Deterministic Flye" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genome Size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gtdbtk data directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filter human reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Illumina reference filter db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan applications" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "InterProScan 5 directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Keep filtered reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SAPP kofamscan limit" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 confidence threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Kraken2 standard report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Adapter fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Disable adapter trimming" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "End adapter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filter human illumina reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum length limit" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mean quality" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum length required" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Poly_x_min_len" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Qualified_quality_phred" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Longread reference filter db" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "start_adapter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trim_front" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trim_poly_x" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trim_tail" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Memory usage (MB)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "When working with metagenomes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Oxford Nanopore reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Only spades assembler" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ONT Basecalling model used for MEDAKA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output BAM file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "PacBio reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run eggNOG-mapper" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use Flye" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run InterProScan" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run kofamscan" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run kraken2 on Illumina reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run Maxbin2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use Medaka" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use PyPolCA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run SemiBin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use SPAdes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SemiBin Environment" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Skip bakta CRISPR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run Bracken" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input URLs used for this run" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Keep mapped reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use SPAdes scaffolds" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Binning output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read filtering output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read filtering output" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2025-09-09T12:28:46Z"^^ns1:Date ; ns1:dateModified "2026-01-23T10:25:06Z"^^ns1:Date ; ns1:description """**Workflow (hybrid) metagenomic assembly and binning**
- Workflow Illumina Quality: - Sequali (control) - hostile contamination filter - fastp (quality trimming) - Workflow Longread Quality: - NanoPlot (control) - fastplong (quality trimming) - hostile contamination filter - Kraken2 taxonomic classification of FASTQ reads - SPAdes/Flye (Assembly) - Medaka/PyPolCA (Assembly polishing) - QUAST (Assembly quality report) (optional) - Workflow binnning - Metabat2/MaxBin2/SemiBin - Binette - BUSCO - GTDB-Tk (optional) - Workflow Genome-scale metabolic models https://workflowhub.eu/workflows/372 - CarveMe (GEM generation) - MEMOTE (GEM test suite) - SMETANA (Species METabolic interaction ANAlysis) Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default

**All tool CWL files and other workflows can be found here:**
https://gitlab.com/m-unlock/cwl/
**How to setup and use an UNLOCK workflow:**
https://docs.m-unlock.nl/docs/workflows/setup.html
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.367.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Metagenomics, Assembly, illumina, binning" ; ns1:license ; ns1:name "(Hybrid) Metagenomics workflow" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T22:57:27Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Detects SNPs and INDELs using VARSCAN2." ; ns1:image ; ns1:input , , ; ns1:keywords "Galaxy, VARSCAN2, SNPs, INDELs" ; ns1:license ; ns1:name "COVID-19: VARSCAN" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome." . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SARS-CoV-2 proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_41" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_42" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_43" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_44" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fasta" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genome/bin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output Destination (prov only)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier used" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "solver" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CarveMe GEMs folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GEMstats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MEMOTE outputs folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Protein files folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SMETANA output" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-07-07T08:23:15Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:01:55Z"^^ns1:Date ; ns1:description """### Workflow for Metagenomics from bins to metabolic models (GEMs) **Summary** - Prodigal gene prediction - CarveMe genome scale metabolic model reconstruction - MEMOTE for metabolic model testing - SMETANA Species METabolic interaction ANAlysis Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default
**All tool CWL files and other workflows can be found here:**
Tools: https://gitlab.com/m-unlock/cwl
Workflows: https://gitlab.com/m-unlock/cwl/workflows **How to setup and use an UNLOCK workflow:**
https://m-unlock.gitlab.io/docs/setup/setup.html
""" ; ns1:image ; ns1:input , , , , ; ns1:keywords "Metagenomics, Genomics, GEM, carveme, memote" ; ns1:license ; ns1:name "Metagenomic GEMs from Assembly" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DLA Region - Training Model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DLA Text - Training Model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "html_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "list_output_txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_file1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-12-14T16:05:01Z"^^ns1:Date ; ns1:dateModified "2022-12-14T16:06:43Z"^^ns1:Date ; ns1:description "An example workflow to allow users to run the Specimen Data Refinery tools on data provided in an input CSV file." ; ns1:input , ; ns1:isBasedOn ; ns1:keywords "Default-SDR" ; ns1:license ; ns1:name "De novo digitisation" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-07-08T13:04:19Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:01:59Z"^^ns1:Date ; ns1:description "An example workflow for the Specimen Data Refinery tool, allowing an individual tool to be used" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.374.1" ; ns1:keywords "Default-SDR, multi-specimen-input, collections, validated-2022-06-29" ; ns1:license ; ns1:name "DLA-Collections-test" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:Person ; ns1:name "Oliver Woolland" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-07-08T13:05:11Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:00Z"^^ns1:Date ; ns1:description "An example workflow for the Specimen Data Refinery tool, allowing an individual tool to be used" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.375.1" ; ns1:keywords "Default-SDR, multi-specimen-input, collections, validated-2022-06-29" ; ns1:license ; ns1:name "HTR-Collections-test" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-07-18T08:08:04Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:01Z"^^ns1:Date ; ns1:description """# ABR\\_Threshold_Detection ## What is this? This code can be used to automatically determine hearing thresholds from ABR hearing curves. One of the following methods can be used for this purpose: + neural network (NN) training, + calibration of a self-supervised sound level regression (SLR) method on given data sets with manually determined hearing thresholds. ## Installation: Run inside the [src](./src) directory: ### Installation as python package ``` pip install -e ./src (Installation as python package) ``` ### Installation as conda virtual environment ``` conda create -n abr_threshold_detection python=3.7 conda activate abr_threshold_detection conda install pip pip install -e ./src ``` ## Usage: Data files can be downloaded here: [https://zenodo.org/deposit/5779876](https://zenodo.org/deposit/5779876). For the Jupyter Notebooks (see the [`notebooks`](./notebooks) directory) to run, the path to the data has to be defined. 
For this, see the corresponding documentation of the respective notebooks. ### Using NNs (`./src/ABR_ThresholdFinder_NN`) The neural network models were trained in `./src/notebooks/GMCtrained_NN*_training.ipynb` with GMC data and in `./src/notebooks/INGtrained_NN*_training.ipynb` with ING data. ``` import ABR_ThresholdFinder_NN.data_preparation as dataprep from ABR_ThresholdFinder_NN.models import create_model_1, compile_model_1 ``` For automatic threshold detection based on NNs, `GMCtrained_NN_threshold_detection.ipynb` and `INGtrained_NN_threshold_detection.ipynb` in `./src/notebooks` can be used. ``` import ABR_ThresholdFinder_NN.data_preparation as dataprep import ABR_ThresholdFinder_NN.thresholder as abrthr ``` ### Using the SLR method (`./src/ABR_ThresholdFinder_SLR`) In `./src/notebooks/GMCcalibrated_SLR_threshold_detection.ipynb` and `./src/notebooks/INGcalibrated_SLR_threshold_detection.ipynb` it is shown how to use the module to: + train a threshold detector on a data set and estimate the thresholds + save a trained model + load a model + apply a trained threshold estimator to a data set + evaluate thresholds by comparing it to a ground truth + evaluate thresholds by analysing signal averages ``` import pandas as pd import numpy as np from ABR_ThresholdFinder_SLR import ABR_Threshold_Detector_multi_stimulus from ABR_ThresholdFinder_SLR.evaluations import evaluate_classification_against_ground_truth, plot_evaluation_curve_for_specific_stimulus ``` ##### Evaluate thresholds by comparing it with a 'ground truth' (a human set threshold in this case) For example: ``` # 5dB buffer evaluation = evaluate_classification_against_ground_truth(GMC_data2, 5, frequency = 'frequency', mouse_id = 'mouse_id', sound_level = 'sound_level', threshold_estimated = 'slr_estimated_thr', threshold_ground_truth = 'threshold') ``` ### Compute and plot evaluation curves that allow to judge the quality of a thresholding Four threshold types are evaluated and compared: + the threshols 
predicted with neural networks ('threshold NN') + the thresholds estimated by a sound level regression method ('threshold SLR') + the human ground truth ('threshold manual') + a constant threshold ('50') For more details, please see `Evaluation_of_ML_detected_thresholds.ipynb` in `./src/notebooks`. ## Folder structure: ### [`data`](./data) Contains the preprocessed ABR and mouse phenotyping datasets from GMC and Ingham et al. in csv format, as well as the mouse ID distributions stored as numpy arrays for neural networks training, validation and testing. ### [`models`](./models) Contains the trained models of the two neural networks and the SLR method, but also the predictions of the first neural network with which the second neural network was fed. ### [`models_cross-validation`](./models_cross-validation) Contains the models that resulted from the cross-validation of the neural networks. ### [`notebooks`](./notebooks) Contains the Jupyter notebooks used for training, testing and evaluation of the neural networks and the SLR method, as well as those used for the hearing curve analysis. ### [`notebooks_reports`](./notebooks_reports) Contains the contents of Jupyter notebooks in html format. ### [`results`](./results) Contains the predictions or estimates made by the neural networks or the SLR method for the two data sets from GMC and Ingham et al. but also all the plots made to analyse the results. ### [`src`](./src) Contains the Python scripts used in the Jupyter notebooks.""" ; ns1:keywords "Machine Learning" ; ns1:license ; ns1:name "ABR Threshold Detection" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-07-20T10:53:54Z"^^ns1:Date ; ns1:dateModified "2023-11-24T08:56:09Z"^^ns1:Date ; ns1:description "Lysozyme in Water simplest version, from COMPSs Tutorial. The original idea of this worklfow comes from http://www.mdtutorials.com/gmx/lysozyme/index.html" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.379.1" ; ns1:image ; ns1:keywords "GROMACS, PyCOMPSs, Marenostrum IV, Supercomputer, non_data_persistence" ; ns1:license ; ns1:name "Lysozyme in Water COMPSs workflow" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T23:03:09Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Alignment, assembly and annotation of RNQSEQ reads using TOPHAT (without filtering out host reads)." ; ns1:image ; ns1:input , , , ; ns1:keywords "Galaxy, Tophat2, Assembly, Alignment, RNASEQ, covid-19" ; ns1:license ; ns1:name "Assembly using Tophat2 and annotation (alternate)" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SARS-CoV-2 proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fast" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Geofile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Grib ATM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Grib ATM Namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Grib SFC" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Grib SFC Namelist " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Metgrid Namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Real Namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VTable ATM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VTable SFC" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "WRF namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-07-29T11:16:20Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:21Z"^^ns1:Date ; ns1:description "A prototype implementation of the Air Quality Prediction pipeline in Galaxy, using CWL tools." ; ns1:input , , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "Air Quality Prediction Prototype" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-09-02T10:48:32Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:24Z"^^ns1:Date ; ns1:description """# Snakemake workflow: dna-seq-varlociraptor [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/snakemake-workflows/dna-seq-varlociraptor/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/dna-seq-varlociraptor/actions?query=branch%3Amaster+workflow%3ATests) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4675661.svg)](https://doi.org/10.5281/zenodo.4675661) A Snakemake workflow for calling small and structural variants under any kind of scenario (tumor/normal, tumor/normal/relapse, germline, pedigree, populations) via the unified statistical model of [Varlociraptor](https://varlociraptor.github.io). ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows%2Fdna-seq-varlociraptor). 
If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). """ ; ns1:keywords "" ; ns1:license ; ns1:name "dna-seq-varlociraptor workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Adrián Muñoz-Civico" . a ns1:Person ; ns1:name "Daniel López-López" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-10-11T10:58:35Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:25Z"^^ns1:Date ; ns1:description """# StructuralVariants Workflow """ ; ns1:isBasedOn ; ns1:keywords "CODEX2, TransBioNet, ExomeDepth, variant calling, cancer, manta, GRIDS, structural variants" ; ns1:license ; ns1:name "CNV_pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-16T20:41:04Z"^^ns1:Date ; ns1:dateModified "2023-05-16T22:01:12Z"^^ns1:Date ; ns1:description """# metaGOflow: A workflow for marine Genomic Observatories' data analysis ![logo](https://raw.githubusercontent.com/hariszaf/metaGOflow-use-case/gh-pages/assets/img/metaGOflow_logo_italics.png) ## An EOSC-Life project The workflows developed in the framework of this project are based on `pipeline-v5` of the MGnify resource. > This branch is a child of the [`pipeline_5.1`](https://github.com/hariszaf/pipeline-v5/tree/pipeline_5.1) branch > that contains all CWL descriptions of the MGnify pipeline version 5.1. 
## Dependencies To run metaGOflow you need to make sure you have the following set on your computing environmnet first: - python3 [v 3.8+] - [Docker](https://www.docker.com) [v 19.+] or [Singularity](https://apptainer.org) [v 3.7.+]/[Apptainer](https://apptainer.org) [v 1.+] - [cwltool](https://github.com/common-workflow-language/cwltool) [v 3.+] - [rdflib](https://rdflib.readthedocs.io/en/stable/) [v 6.+] - [rdflib-jsonld](https://pypi.org/project/rdflib-jsonld/) [v 0.6.2] - [ro-crate-py](https://github.com/ResearchObject/ro-crate-py) [v 0.7.0] - [pyyaml](https://pypi.org/project/PyYAML/) [v 6.0] - [Node.js](https://nodejs.org/) [v 10.24.0+] - Available storage ~235GB for databases ### Storage while running Depending on the analysis you are about to run, disk requirements vary. Indicatively, you may have a look at the metaGOflow publication for computing resources used in various cases. ## Installation ### Get the EOSC-Life marine GOs workflow ```bash git clone https://github.com/emo-bon/MetaGOflow cd MetaGOflow ``` ### Download necessary databases (~235GB) You can download databases for the EOSC-Life GOs workflow by running the `download_dbs.sh` script under the `Installation` folder. ```bash bash Installation/download_dbs.sh -f [Output Directory e.g. ref-dbs] ``` If you have one or more already in your system, then create a symbolic link pointing at the `ref-dbs` folder or at one of its subfolders/files. The final structure of the DB directory should be like the following: ````bash user@server:~/MetaGOflow: ls ref-dbs/ db_kofam/ diamond/ eggnog/ GO-slim/ interproscan-5.57-90.0/ kegg_pathways/ kofam_ko_desc.tsv Rfam/ silva_lsu/ silva_ssu/ ```` ## How to run ### Ensure that `Node.js` is installed on your system before running metaGOflow If you have root access on your system, you can run the commands below to install it: ##### DEBIAN/UBUNTU ```bash sudo apt-get update -y sudo apt-get install -y nodejs ``` ##### RH/CentOS ```bash sudo yum install rh-nodejs (e.g. 
rh-nodejs10) ``` ### Set up the environment #### Run once - Setup environment - ```bash conda create -n EOSC-CWL python=3.8 ``` - ```bash conda activate EOSC-CWL ``` - ```bash pip install cwlref-runner cwltool[all] rdflib-jsonld rocrate pyyaml ``` #### Run every time ```bash conda activate EOSC-CWL ``` ### Run the workflow - Edit the `config.yml` file to set the parameter values of your choice. For selecting all the steps, then set to `true` the variables in lines [2-6]. #### Using Singularity ##### Standalone - run: ```bash ./run_wf.sh -s -n osd-short -d short-test-case -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz `` ##### Using a cluster with a queueing system (e.g. SLURM) - Create a job file (e.g., SBATCH file) - Enable Singularity, e.g. module load Singularity & all other dependencies - Add the run line to the job file #### Using Docker ##### Standalone - run: ``` bash ./run_wf.sh -n osd-short -d short-test-case -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz ``` HINT: If you are using Docker, you may need to run the above command without the `-s' flag. ## Testing samples The samples are available in the `test_input` folder. We provide metaGOflow with partial samples from the Human Metagenome Project ([SRR1620013](https://www.ebi.ac.uk/ena/browser/view/SRR1620013) and [SRR1620014](https://www.ebi.ac.uk/ena/browser/view/SRR1620014)) They are partial as only a small part of their sequences have been kept, in terms for the pipeline to test in a fast way. ## Hints and tips 1. In case you are using Docker, it is strongly recommended to **avoid** installing it through `snap`. 2. `RuntimeError`: slurm currently does not support shared caching, because it does not support cleaning up a worker after the last job finishes. Set the `--disableCaching` flag if you want to use this batch system. 3. 
In case you are having errors like: ``` cwltool.errors.WorkflowException: Singularity is not available for this tool ``` You may run the following command: ``` singularity pull --force --name debian:stable-slim.sif docker://debian:stable-sli ``` ## Contribution To make contribution to the project a bit easier, all the MGnify `conditionals` and `subworkflows` under the `workflows/` directory that are not used in the metaGOflow framework, have been removed. However, all the MGnify `tools/` and `utils/` are available in this repo, even if they are not invoked in the current version of metaGOflow. This way, we hope we encourage people to implement their own `conditionals` and/or `subworkflows` by exploiting the currently supported `tools` and `utils` as well as by developing new `tools` and/or `utils`. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.384.3" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "Biodiversity" ; ns1:license ; ns1:name "A workflow for marine Genomic Observatories data analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-01-23T11:16:11Z"^^ns1:Date ; ns1:dateModified "2024-03-07T09:01:21Z"^^ns1:Date ; ns1:description """BackTrackBB is a program for detection and space-time location of seismic sources based on multi-scale, frequency-selective statistical coherence of the wave field recorded by dense large-scale seismic networks and local antennas. The method is designed to enhance coherence of the signal statistical features across the array of sensors and consists of three steps. They are signal processing, space-time imaging and detection and location. 
Source with inputs and outputs included (too big for WorkflowHub): [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7788030.svg)](https://doi.org/10.5281/zenodo.7788030) More information: https://backtrackbb.github.io/""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.386.2" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "PyCOMPSs, Seismic Detection, BackTrackBB, Earthquake Detection, Marenostrum IV, Supercomputer, non_data_persistence" ; ns1:license ; ns1:name "BackTrackBB: Multi-band array detection and location of seismic sources" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "KNIME" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-09-26T09:15:13Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:31Z"^^ns1:Date ; ns1:description """Generates Dose-response curve fits on cell-based toxicity data. Outliers of replicate data-sets can be removed by setting a threshold for standard deviation (here set to 25). Curve fits for compounds showing low response can be removed by setting a threshold for minimum activity (here set to 75% confluence). This workflow needs R-Server to run in the back-end. Start R and run the following command: library(Rserve); Rserve(args = "--vanilla")""" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "DRC_template_toxicity" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "KNIME" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-09-26T09:58:39Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:32Z"^^ns1:Date ; ns1:description """This workflow can be used to fit dose-response curves from normalised cell-based assay data (%confluence) using the KNIME HCS extension. 
The workflow expects triplicates for each of eight test concentrations. This workflow needs R-Server to run in the back-end. Start R and run the following command: library(Rserve); Rserve(args = "--vanilla"). Three types of outliers can be removed: 1 - Outliers from triplicate measurement (standard deviation cut-off can be selected), 2 - inactive and weekly active compounds (% confluence cut-offs can be selected), 3 - toxic concentrations (cut-off for reduction in confluence with stepwise increasing concentration can be selected) Output are two dose-response curve fits per compound for pre and post outlier removal with graphical representation and numerical fit parameters. """ ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "DRC_cellbased_OutlierDetection" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "KNIME" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-09-26T10:15:37Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:33Z"^^ns1:Date ; ns1:description """This workflow can be used to fit dose-response curves from normalised biochemical assay data (%Inhibition) using the HCS extension. This workflow needs R-Server to run in the back-end. Start R and run the following command: library(Rserve); Rserve(args = "--vanilla") IC50 values will not be extrapolated outside the tested concentration range For activity classification the following criteria are applied: - maximum (average % inhibion) >25 % and slope is >0 and IC50 > 5 µM or - minimum (average % inhibion) >75 % Results are formatted for upload to the European Chemical Biology Database (ECBD)""" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "DRC_biochemical_toECBD" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T23:07:23Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Alignment, assembly RNASEQ reads and annotation of generated transcripts." ; ns1:image ; ns1:input , ; ns1:keywords "Unicycler, Alignment, Assembly, Annotation, RNASEQ, covid-19" ; ns1:license ; ns1:name "Unicycler assembly and annotation" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SARS-CoV-2 proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_41" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_42" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_43" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Alignment score from Kpax to analyse structures" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CATH family ids" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Obsolete and inconsistent CATH domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename for residue-mapped CATH domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Core domain structure (.pdb)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database to select to compute core average structure" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "To store all the domain-like StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "To store all failed domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename to store family ids per iteration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Iteration number starting from 0" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Threshold for minimum domain length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Parameter file for current iteration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "The directory for storing all PDB files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam family ids" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Obsolete and inconsistent Pfam domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename for residue-mapped Pfam domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CATH cross-mapped domain StIs from previous iteration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam cross-mapped domain StIs from previous iteration" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Score threshold for given alignment score from Kpax" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Directory for storing all SIFTS files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "To store all the true domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename to store unmapped and not properly aligned instances from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename to store unmapped but structurally well aligned instances from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename to store unmapped and not properly aligned instances from Pfam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename to store unmapped but structurally well aligned instances from Pfam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename with alignment scores for unmapped instances" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Alignment results for CATH unmapped instances" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Alignment results for Pfam unmapped instances" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Domain-like StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Failed domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All CATH cross-mapped domin StIs family-wise together" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All Pfam domain StIs cross-mapped to CATH family-wise" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Alignment results from Kpax for all cross-mapped families" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Average structures per cross-mapped Pfam family for CATH StIs at family level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Core domain StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Core domain structure (.pdb)" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CATH domain StIs cross-mapped to Pfam family-wise" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam domin StIs cross-mapped to CATH family-wise" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cross-mapped families with CATH domain StIs passing the threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cross-mapped families with Pfam domain StIs passing the threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merged cross-mapped and residue-mapped domain StIs from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merged cross-mapped and residue-mapped domain StIs from Pfam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Family ids per iteration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Parameter file for next iteration of the workflow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Average structures per cross-mapped CATH family for Pfam StIs at family level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Obsolete and inconsistent domain StIs from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Obsolete and inconsistent domain StIs from Pfam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All CATH residue-mapped domain StIs with domain labels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All Pfam residue-mapped domain StIs with domain labels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "True domain StIs per iteration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All un-mapped domin StIs from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Failed domain StIs from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Domain-like StIs from CATH" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All Pfam un-mapped domin StIs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Failed domain StIs from Pfam" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Domain-like StIs from Pfam" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-06-20T12:06:49Z"^^ns1:Date ; ns1:dateModified "2023-06-20T12:08:47Z"^^ns1:Date ; ns1:description """# CroMaSt: A workflow for assessing protein domain classification by cross-mapping of structural instances between domain databases and structural alignment CroMaSt (**Cro**ss **Ma**pper of domain **St**ructural instances) is an automated iterative workflow to clarify the assignment of protein domains to a given domain type of interest, based on their 3D structure and by cross-mapping of domain structural instances between domain databases. CroMaSt (for Cross-Mapper of domain Structural instances) will classify all structural instances of a given domain type into 4 different categories (**Core**, **True**, **Domain-like**, and **Failed**). ## Requirements 1. [Conda](https://docs.conda.io/projects/conda/en/latest/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) 2. [Kpax](http://kpax.loria.fr/download.php) Download and install conda (or Miniconda) and Kpax by following the instructions from their official site. ## Get it running (Considering the requirements are already met) 1. Clone the repository and change the directory ``` git clone https://gitlab.inria.fr/capsid.public_codes/CroMaSt.git cd CroMaSt ``` 2. Create the conda environment for the workflow ``` conda env create --file yml/environment.yml conda activate CroMaSt ``` 3. Change the path of variables in paramter file ``` sed -i 's/\\/home\\/hdhondge\\/CroMaSt\\//\\/YOUR\\/PATH\\/TO_CroMaSt\\//g' yml/CroMaSt_input.yml ``` 4. Create the directory to store files from PDB and SIFTS (if not already) ``` mkdir PDB_files SIFTS ``` 5. 
Download the source input data ``` cwl-runner Tools/download_data.cwl yml/download_data.yml ``` ## Basic example ### 1. First, we will run the workflow for the KH domain with family identifiers `RRM_1` and `RRM` in Pfam and CATH, respectively. Run the workflow - ``` cwl-runner --parallel --outdir=Results/ CroMaSt.cwl yml/CroMaSt_input.yml ``` ### 2. Once the iteration is complete, check the `new_param.yml` file from the `outputdir` (Results), if there is any family identifier in either `pfam` or `cath`; run the next iteration using following command (Until there is no new families explored by workflow) - ``` cwl-runner --parallel --outdir=Results/ CroMaSt.cwl Results/new_param.yml ``` ### **Extra:** Start the workflow with multiple families from one or both databases If you would like to start the workflow with multiple families from one or both databases, then simply add a comma in between two family identifiers. ``` pfam: ['PF00076', 'PF08777'] cath: ['3.30.70.330'] ``` - **Pro Tip**: Don't forget to give different path to `--outdir` option while running the workflow multiple times or at least move the results to some other location after first run. ## Run the workflow for protein domain of your choice ### 1. You can run the workflow for the domain of your choice by simply changing the family identifers in `yml/CroMaSt_input.yml` file. Simply replace the following values of family identifiers (for pfam and cath) with the family identifiers of your choice in `yml/CroMaSt_input.yml` file. ``` pfam: ['PF00076'] cath: ['3.30.70.330'] ``` ## Data files used in current version are as follows: **Files in Data directory can be downloaded as follows**: 1. File used from Pfam database: [pdbmap.gz](http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam35.0/pdbmap.gz) 2. File used from CATH database: [cath-domain-description-file.txt](ftp://orengoftp.biochem.ucl.ac.uk:21/cath/releases/latest-release/cath-classification-data/cath-domain-description-file.txt) 3. 
Obsolete entries from RCSB PDB [obsolete_PDB_entry_ids.txt](https://data.rcsb.org/rest/v1/holdings/removed/entry_ids) CATH Version - 4.3.0 (Ver_Date - 11-Sep-2019) [FTP site](ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/latest-release/cath-classification-data/) Pfam Version - 35.0 (Ver_Date - November-2021) [FTP site](http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam35.0/) ## Reference ``` Poster - 1. Hrishikesh Dhondge, Isaure Chauvot de Beauchêne, Marie-Dominique Devignes. CroMaSt: A workflow for domain family curation through cross-mapping of structural instances between protein domain databases. 21st European Conference on Computational Biology, Sep 2022, Sitges, Spain. ⟨hal-03789541⟩ ``` ## Acknowledgements This project has received funding from the Marie Skłodowska-Curie Innovative Training Network (MSCA-ITN) RNAct supported by European Union’s Horizon 2020 research and innovation programme under granta greement No 813239. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.390.2" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Pfam, CATH, Protein domains, data integration" ; ns1:license ; ns1:name "CroMaSt: A workflow for assessing protein domain classification by cross-mapping of structural instances between domain databases and structural alignment" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-10-12T02:34:32Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:02:39Z"^^ns1:Date ; ns1:description """# IndexReferenceFasta-nf =========== - [Description](#description) - [Diagram](#diagram) - [User guide](#user-guide) - [Benchmarking](#benchmarking) - [Workflow summaries](#workflow-summaries) - [Metadata](#metadata) - [Component tools](#component-tools) - [Required (minimum) inputs/parameters](#required-minimum-inputsparameters) - [Additional notes](#additional-notes) - [Help/FAQ/Troubleshooting](#helpfaqtroubleshooting) - [Acknowledgements/citations/credits](#acknowledgementscitationscredits) --- ## Description This is a flexible pipeline for generating common reference genome index files for WGS data analysis. IndexReferenceFasta-nf is a Nextflow (DSL2) pipeline that runs the following tools using Singularity containers: * Samtools faidx * BWA index * GATK CreateSequenceDictionary ## Diagram

## User guide **1. Set up** Clone this repository by running: ``` git clone https://github.com/Sydney-Informatics-Hub/IndexReferenceFasta-nf.git cd IndexReferenceFasta-nf ``` **2. Generate indexes** Users can specify which index files to create by using the `--samtools`, `--bwa`, and/or `--gatk` flags. All are optional. Run the pipeline with: ``` nextflow run main.nf /path/to/ref.fasta --bwa --samtools --gatk ``` ## Benchmarking ### Human hg38 reference assembly @ Pawsey's Nimbus (NCPU/task = 1) |task_id|hash |native_id|name |status |exit|submit |duration |realtime |%cpu |peak_rss|peak_vmem|rchar |wchar | |-------|---------|---------|--------------|---------|----|-------|----------|----------|-------|--------|---------|-------|-------| |3 |27/33fffc|131621 |samtools_index|COMPLETED|0 |55:44.9|12.2s |12s |99.20% |6.3 MB |11.8 MB |3 GB |19.1 KB| |1 |80/f03e46|131999 |gatk_index |COMPLETED|0 |55:46.7|22.6s |22.3s |231.90%|3.8 GB |37.1 GB |3.1 GB |726 KB | |2 |ea/e29535|131594 |bwa_index |COMPLETED|0 |55:44.9|1h 50m 16s|1h 50m 15s|99.50% |4.5 GB |4.5 GB |12.1 GB|8.2 GB | ## Workflow summaries ### Metadata |metadata field | workflow_name / workflow_version | |-------------------|:---------------------------------:| |Version | workflow_version | |Maturity | under development | |Creators | Georgie Samaha | |Source | NA | |License | GPL-3.0 license | |Workflow manager | NextFlow | |Container | None | |Install method | Manual | |GitHub | Sydney-Informatics-Hub/IndexReferenceFasta-nf | |bio.tools | NA | |BioContainers | NA | |bioconda | NA | ### Component tools * samtools/1.15.1 * gatk/4.2.6.1 * bwa/0.7.17 ### Required (minimum) inputs/parameters * A reference genome file in fasta format. 
## Additional notes ### Help/FAQ/Troubleshooting ## Acknowledgements/citations/credits ### Authors - Georgie Samaha (Sydney Informatics Hub, University of Sydney) ### Acknowledgements - This pipeline was built using the [Nextflow DSL2 template](https://github.com/Sydney-Informatics-Hub/Nextflow_DSL2_template). - Documentation was created following the [Australian BioCommons documentation guidelines](https://github.com/AustralianBioCommons/doc_guidelines). ### Cite us to support us! Acknowledgements (and co-authorship, where appropriate) are an important way for us to demonstrate the value we bring to your research. Your research outcomes are vital for ongoing funding of the Sydney Informatics Hub and national compute facilities. We suggest including the following acknowledgement in any publications that follow from this work: The authors acknowledge the technical assistance provided by the Sydney Informatics Hub, a Core Research Facility of the University of Sydney and the Australian BioCommons which is enabled by NCRIS via Bioplatforms Australia. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.393.1" ; ns1:keywords "Bioinformatics, Nextflow, WGS, index, referencegenome, SAMTools, GATK, BWA, Genomics" ; ns1:license ; ns1:name "IndexReferenceFasta-nf" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Lucille Delisle" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/PE fastq input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/adapter_forward" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/adapter_reverse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/effective_genome_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/normalize_profile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/BAM filtered rmDup" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Coverage from MACS2 (bigwig)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 peaks xls" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 summits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Mapping stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MarkDuplicates metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC on input dataset(s): Stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC webpage" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-04-08T02:01:49Z"^^ns1:Date ; ns1:dateModified "2026-04-08T02:01:49Z"^^ns1:Date ; ns1:description "Complete CUT&RUN/CUT&TAG analysis workflow for paired-end sequencing data. Processes raw FASTQ files through adapter removal (cutadapt) and alignment (Bowtie2 with dovetail option enabled). Applies quality filtering (MAPQ ≥ 30, concordant pairs only), converts BAM to BED format, and performs peak calling using MACS2 with parameters optimized for the punctate signal profile characteristic of CUT&RUN/CUT&TAG experiments." ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:keywords "CUTnRUN" ; ns1:license ; ns1:name "cutandrun/main" ; ns1:output , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 19 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Adapter sequence" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Effective genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Normalize profile" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Percentage of bad quality bases per read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SR fastq input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 peaks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 summits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC on input dataset(s): Stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/coverage from MACS2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered BAM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapping stats" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-04-11T02:02:29Z"^^ns1:Date ; ns1:dateModified "2026-04-11T02:02:29Z"^^ns1:Date ; ns1:description "Complete ChIP-seq analysis for single-end sequencing data. Processes raw FASTQ files through adapter removal (fastp), alignment to reference genome (Bowtie2), and quality filtering (MAPQ greater than 30). Peak calling with MACS2 uses a fixed extension of 200bp to identify protein-DNA binding sites. Generates alignment files, coverage, peak calls, and quality metrics for downstream analysis." ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:keywords "ChIP-seq" ; ns1:license ; ns1:name "chipseq-sr/main" ; ns1:output , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 17 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Effective genome size" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Normalize profile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/PE fastq input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Percentage of bad quality bases per read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 peaks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 summits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC on input dataset(s): Stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/coverage from MACS2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered BAM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapping stats" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-04-11T02:02:33Z"^^ns1:Date ; ns1:dateModified "2026-04-11T02:02:33Z"^^ns1:Date ; ns1:description "Complete ChIP-seq analysis for paired-end sequencing data. Processes raw FASTQ files through adapter removal (fastp), alignment to reference genome (Bowtie2), and stringent quality filtering (MAPQ greater than 30, concordant pairs only). Peak calling with MACS2 optimized for paired-end reads identifies protein-DNA binding sites. Generates alignment files, coverage, peak calls, and quality metrics for downstream analysis." 
; ns1:input , , , , ; ns1:isBasedOn ; ns1:keywords "ChIP-seq" ; ns1:license ; ns1:name "chipseq-pe/main" ; ns1:output , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 18 . a ns1:Person ; ns1:name "Lucille Delisle" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Bin size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Effective genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/PE fastq input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Percentage of bad quality bases per read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference Genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/1kb around summits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/BAM filtered rmDup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Coverage from MACS2 (bigwig)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MACS2 report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MarkDuplicates metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC on input dataset(s): Stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MultiQC webpage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Nb of reads in summits +-500bp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bigwig_norm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bigwig_norm2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/histogram of fragment length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mapping stats" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-04-18T02:01:43Z"^^ns1:Date ; ns1:dateModified "2026-04-18T02:01:43Z"^^ns1:Date ; ns1:description "Complete ATAC-seq analysis pipeline for paired-end reads. Processes raw FASTQ data through adapter and bad quality removal (fastp), alignment (Bowtie2 end-to-end), and filtering (removes MT reads, discordant pairs, low mapping quality below 30, PCR duplicates). Generates 5' cut site pileups (±100bp), performs peak calling, and quantifies reads in 1kb summit-centered regions. Produces two normalized coverage tracks (per million mapped reads and per million reads in peaks) and fragment length distribution plots for quality assessment." ; ns1:input , , , , ; ns1:isBasedOn ; ns1:keywords "ATACseq" ; ns1:license ; ns1:name "atacseq/main" ; ns1:output , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 20 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T23:13:25Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Alignment, assembly and annotation of RNASEQ reads as well as annotation of generated transcripts." ; ns1:image ; ns1:input , , , ; ns1:keywords "Alignment, Assembly, Annotation, RNASEQ, StringTie, covid-19" ; ns1:license ; ns1:name "StringTie assembly and annotation" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-04-10T10:45:00Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:39:45Z"^^ns1:Date ; ns1:description """Virus genome assembly with Unicycler and Spades, The 2 assemblers works in parallel. The graph visualization is made with Bandage. 
workflow git repository : https://github.com/fjrmoreews/cwl-workflow-SARS-CoV-2/blob/master/Assembly/workflow/assembly-wf-virus.cwl Based on https://github.com/galaxyproject/SARS-CoV-2/blob/master/genomics/2-Assembly/as_wf.png """ ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "covid-19, Assembly" ; ns1:license ; ns1:name "Virus genome assembly with Unicycler and Spades." ; ns1:output , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "List of Illumina accessions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome." . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SARS-CoV-2 proteins" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_34" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_41" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_42" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_43" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_44" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_45" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_46" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_47" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_48" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_49" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_50" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_51" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_52" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_53" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_54" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_55" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_56" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_57" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_58" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fast" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastq" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name "Lucille Delisle" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Collection of FASTQ files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compute Cufflinks FPKM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compute StringTie FPKM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward adapter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF file of annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF with regions to exclude from FPKM normalization with Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Generate additional QC reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Strandedness" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use featureCounts for generating count tables" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Counts Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Gene Abundance Estimates from StringTie" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genes Expression from Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mapped Reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC HTML report" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Small MultiQC HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Small MultiQC stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stranded Coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Transcripts Expression from Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Unstranded Coverage" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-02-18T03:01:52Z"^^ns1:Date ; ns1:dateModified "2026-02-18T03:01:52Z"^^ns1:Date ; ns1:description "Complete RNA-Seq analysis for single-end data: Processes raw FASTQ data through adapter and bad quality removal (fastp), alignment with STAR using ENCODE parameters, gene quantification via multiple methods (STAR and featureCounts), and expression calculation (FPKM with Cufflinks/StringTie, normalized coverage with bedtools). Produces count tables, normalized expression values, and genomic coverage tracks. Supports stranded and unstranded libraries, generating both HTSeq-compatible counts and normalized measures for downstream analysis." ; ns1:input , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "RNASEQ" ; ns1:license ; ns1:name "rnaseq-sr/main" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 13 . a ns1:Person ; ns1:name "Lucille Delisle" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Collection paired FASTQ files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compute Cufflinks FPKM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compute StringTie FPKM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward adapter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF file of annotation" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF with regions to exclude from FPKM normalization with Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Generate additional QC reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse adapter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Strandedness" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Use featureCounts for generating count tables" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Counts Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Gene Abundance Estimates from StringTie" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genes Expression from Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mapped Reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MultiQC stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Small MultiQC HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Small MultiQC stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Stranded Coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Transcripts Expression from Cufflinks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Unstranded Coverage" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-02-18T03:01:49Z"^^ns1:Date ; ns1:dateModified "2026-02-18T03:01:49Z"^^ns1:Date ; ns1:description "Complete RNA-Seq analysis for paired-end data: Processes raw FASTQ data through adapter and bad quality removal (fastp), alignment with STAR using ENCODE parameters, gene quantification via multiple methods (STAR and featureCounts), and expression calculation (FPKM with Cufflinks/StringTie, normalized coverage with bedtools). 
Produces count tables, normalized expression values, and genomic coverage tracks. Supports stranded and unstranded libraries, generating both HTSeq-compatible counts and normalized measures for downstream analysis." ; ns1:input , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "RNASEQ" ; ns1:license ; ns1:name "rnaseq-pe/main" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 14 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-24T13:50:34Z"^^ns1:Date ; ns1:dateModified "2023-11-24T13:54:50Z"^^ns1:Date ; ns1:description "This workflow is designed to analyze to a multi-omics data set that comprises genome-wide DNA methylation profiles, targeted metabolomics, and behavioral data of two cohorts that participated in the ACTION Biomarker Study (ACTION, Aggression in Children: Unraveling gene-environment interplay to inform Treatment and InterventiON strategies. (Boomsma 2015, Bartels 2018, Hagenbeek 2020, van Dongen 2021, Hagenbeek 2022). The ACTION-NTR cohort consists of twins that are either longitudinally concordant or discordant for childhood aggression. The ACTION-Curium-LUMC cohort consists of children referred to the Dutch LUMC Curium academic center for child and youth psychiatry. With the joint analysis of multi-omics data and behavioral data, we aim to identify substructures in the ACTION-NTR cohort and link them to aggressive behavior. First, the individuals are clustered using Similarity Network Fusion (SNF, Wang 2014), and latent feature dimensions are uncovered using different unsupervised methods including Multi-Omics Factor Analysis (MOFA) (Argelaguet 2018) and Multiple Correspondence Analysis (MCA, Lê 2008, Husson 2017). 
In a second step, we determine correlations between -omics and phenotype dimensions, and use them to explain the subgroups of individuals from the ACTION-NTR cohort. In order to validate the results, we project data of the ACTION-Curium-LUMC cohort onto the latent dimensions and determine if correlations between omics and phenotype data can be reproduced." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.402.8" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Multi-omics, Metabolomics, Epigenomics, Behavioral data, FAIR, Nextflow" ; ns1:license ; ns1:name "X-omics ACTIONdemonstrator analysis workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 8 . a ns1:Person ; ns1:name "Kate Farquharson" . a ns1:Person ; ns1:name "Simon Tang" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/FASTA contigs - Primary Assembly " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reads that were used for the assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Busco and dependencies version" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Busco on input dataset(s): full table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Fasta Statistics on input dataset(s): summary stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Genome assembly metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Genome coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): bed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): png" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): qv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): size files" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merqury on input dataset(s): wig" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Meryl on input dataset(s): read-db.meryldb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Quast on input dataset(s): HTML report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Quast on input dataset(s): PDF report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Quast on input dataset(s): Log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Quast on input dataset(s): tabular report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_22" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/out_file1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/outfile" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2026-04-17T09:06:28Z"^^ns1:Date ; ns1:dateModified "2026-04-17T09:08:06Z"^^ns1:Date ; ns1:description """Post-genome assembly quality control workflow using Quast, BUSCO, Meryl, Merqury and Fasta Statistics, with updates November 2024. Workflow inputs: reads as fastqsanger.gz (not fastq.gz), and primary assembly.fasta. (To change reads format: click on the pencil icon next to the file in the Galaxy history, then "Datatypes", then set "New type" as fastqsanger.gz). Note: the reads should be those that were used for the assembly (i.e., the filtered/cleaned reads), not the raw reads. What it does: Computes read coverage. Runs Quast. Runs Fasta Statistics. Runs Meryl and Merqury. Runs Busco. (New default settings for BUSCO: lineage = eukaryota; for Quast: lineage = eukaryotes, genome = large.) 
Workflow outputs: Reports assembly stats into a table called metrics.tsv, including selected metrics from Fasta Stats, and read coverage; reports BUSCO versions and dependencies; and displays these tables in the workflow report. Note: a known bug is that sometimes the workflow report text resets to default text. To check and restore: open the workflow in Galaxy for editing. Click on the "Edit Report" icon (top right, pencil icon). Copy and paste the following text into the workflow report, then exit this report page, then save the workflow. # Workflow Execution Report Workflow name: Genome assessment post assembly ## Genome assembly metrics Selected statistics from the workflow outputs. Additional metrics are available in other outputs in the history. ```galaxy history_dataset_display(output="Genome assembly metrics") ``` ## Software Busco version and dependencies: ```galaxy history_dataset_display(output="Busco and dependencies version") ``` ## Galaxy Australia Thanks for using Galaxy! When you use Galaxy Australia to support your publication or project, please acknowledge its use with the following statement: "This work is supported by Galaxy Australia, a service provided by the Australian Biocommons and its partners. The service receives NCRIS funding through Bioplatforms Australia and the Australian Research Data Commons (https://doi.org/10.47486/PL105), as well as The University of Melbourne and Queensland Government RICF funding." """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.403.10" ; ns1:input , ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "HiFi, hifiasm, QC, Quast, Meryl, Merqury, BUSCO" ; ns1:license ; ns1:name "Genome-assessment-post-assembly" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 10 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-11-18T14:39:27Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:45:44Z"^^ns1:Date ; ns1:description """This Galaxy-E workflow was made from the ["Cleaning GBIF data for the use in biogeography" tutorial](https://ropensci.github.io/CoordinateCleaner/articles/Cleaning_GBIF_data_with_CoordinateCleaner.html) and allows to: - Use CoordinateCleaner to automatically flag problematic records - Use GBIF provided meta-data to improve coordinate quality, tailored to your downstream analyses - Use automated cleaning algorithms of CoordinateCleaner to identify problematic contributing datasets - Visualize data on a map""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.404.1" ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "GBIF data Quality check and filtering workflow Feb-2020" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Plasmids" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2022-11-24T13:28:47Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:44Z"^^ns1:Date ; ns1:description "Workflow for the GTN training \"Antibiotic resistance detection\"" ; ns1:input ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "GTN Training - Antibiotic Resistance Detection" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-11-24T13:42:42Z"^^ns1:Date ; ns1:dateModified "2024-09-09T08:23:33Z"^^ns1:Date ; ns1:description """With this galaxy pipeline you can use Salmonella sp. 
next generation sequencing results to predict bacterial AMR phenotypes and compare the results against gold standard Salmonella sp. phenotypes obtained from food. This pipeline is based on the work of the National Food Agency of Canada. Doi: [10.3389/fmicb.2020.00549](https://doi.org/10.3389/fmicb.2020.00549) | tool | version | license | | -- | -- | -- | | SeqSero2 | 1.2.1 | [GNU GPL v2.0](https://github.com/denglab/SeqSero2/blob/master/LICENSE) | | BBTools | 39.01 | [MIT License](https://github.com/kbaseapps/BBTools/blob/master/LICENSE) | | SRST2 | 0.2.0 | [BSD License](https://github.com/katholt/srst2/blob/master/LICENSE.txt) | | hamronize | 1.0.3 | [GNU LGPL v3.0](https://github.com/pha4ge/hAMRonization/blob/master/LICENSE.txt) | | SPAdes | v3.15.5 | [GNU GPL v2.0](https://github.com/ablab/spades/blob/main/LICENSE) | | SKESA | 3.0.0 | [Public Domain](https://github.com/ncbi/SKESA/blob/master/LICENSE) | | pilon | 1.1.0 | [GNU GPL v2.0](https://github.com/broadinstitute/pilon/blob/master/LICENSE) | | shovill | 1.0.4 | [GPL-3.0 license](https://github.com/tseemann/shovill/blob/master/LICENSE) | | sistr | 1.1.1 | [Apache-2.0 license](https://github.com/phac-nml/sistr_cmd/blob/master/LICENSE) | | MOB-Recon | 3.0.3 | [Apache-2.0 license](https://github.com/phac-nml/mob-suite/blob/master/LICENSE) |""" ; ns1:image ; ns1:keywords "Bioinformatics, antimicrobial resistance" ; ns1:license ; ns1:name "Workflow 3: AMR - SeqSero2/SISTR" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-11-29T20:09:43Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:04:54Z"^^ns1:Date ; ns1:description """[![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) [![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map) ## Reads2Map Reads2Map presents a collection of [WDL workflows](https://openwdl.org/) to build linkage maps from sequencing reads. Each workflow release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases). The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. The `SimulatedReads2Map.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map build. ![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png) ## How to use Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines). 
To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you which to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies). ## Documentation Check the description of the inputs for the pipelines: * [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Reads2Map/EmpiricalReads.html) * [SimulatedReads2Map](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html) Check how to evaluate the workflows results in Reads2MapApp Shiny: * [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) Once you selected the best pipeline using a subset of your data, you can build a complete high-density linkage map: * [A Guide to Build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html) Check more information and examples of usage in: * [Taniguti, C. H., Taniguti, L. M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., & Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. 
https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v2) ## Third-party software and images - [BWA](https://github.com/lh3/bwa) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Used to align simulated reads to reference; - [cutadapt](https://github.com/marcelm/cutadapt) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Trim simulated reads; - [ddRADseqTools](https://github.com/GGFHF/ddRADseqTools) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Set of applications useful to in silico design and testing of double digest RADseq (ddRADseq) experiments; - [Freebayes](https://github.com/ekg/freebayes) in [Cristaniguti/freebayes:0.0.1](): Variant call step; - [GATK](https://github.com/broadinstitute/gatk) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Variant call step using Haplotype Caller, GenomicsDBImport and GenotypeGVCFs; - [PedigreeSim](https://github.com/PBR/pedigreeSim?files=1) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Simulates progeny genotypes from parents genotypes for different types of populations; - [picard](https://github.com/broadinstitute/picard) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files; - [pirs](https://github.com/galaxy001/pirs) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): To generate simulates paired-end reads from a reference genome; - 
[samtools](https://github.com/samtools/samtools) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files; - [SimuSCoP](https://github.com/qasimyu/simuscop) in [cristaniguti/simuscopr:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/simuscopr): Exome and WGS Illumina reads simulations; - [RADinitio](http://catchenlab.life.illinois.edu/radinitio/) in [ cristaniguti/radinitio:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/radinitio): RADseq Illumina reads simulation; - [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Efficient Exact Maximum a Posteriori Computation for Bayesian SNP Genotyping in Polyploids; - [bcftools](https://github.com/samtools/bcftools) in [lifebitai/bcftools:1.10.2](https://hub.docker.com/r/lifebitai/bcftools): utilities for variant calling and manipulating VCFs and BCFs; - [vcftools](http://vcftools.sourceforge.net/) in [cristaniguti/split_markers:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/split_markers): program package designed for working with VCF files. - [MCHap](https://github.com/PlantandFoodResearch/MCHap) in [cristaniguti/mchap:0.7.0](https://hub.docker.com/repository/docker/cristaniguti/mchap): Polyploid micro-haplotype assembly using Markov chain Monte Carlo simulation. 
### R packages - [OneMap](https://github.com/augusto-garcia/onemap) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Is a software for constructing genetic maps in experimental crosses: full-sib, RILs, F2 and backcrosses; - [Reads2MapTools](https://github.com/Cristianetaniguti/Reads2MapTools) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Support package to perform mapping populations simulations and genotyping for OneMap genetic map building - [GUSMap](https://github.com/tpbilton/GUSMap): Genotyping Uncertainty with Sequencing data and linkage MAPping - [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data - [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids - [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results - [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.409.1" ; ns1:keywords "WDL, linkage_map, variant_calling" ; ns1:license ; ns1:name "EmpiricalReads2Map" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-06-18T23:17:39Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Alignment, assembly and annotation of generated transcripts from RNASEQ reads." ; ns1:image ; ns1:input , , , ; ns1:keywords "Alignment, Assembly, Annotation, Tophat2, RNASEQ, covid-19" ; ns1:license ; ns1:name "Assembly using Tophat2 and annotation" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_23" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-11-29T20:17:01Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:04:54Z"^^ns1:Date ; ns1:description """[![Development](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) [![Reads2Map](https://circleci.com/gh/Cristianetaniguti/Reads2Map.svg?style=svg)](https://app.circleci.com/pipelines/github/Cristianetaniguti/Reads2Map) ## Reads2Map Reads2Map presents a collection of [WDL workflows](https://openwdl.org/) to build linkage maps from sequencing reads. Each workflow release is described in the [Read2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases). The main workflows are the `EmpiricalReads2Map.wdl` and the `SimulatedReads2Map.wdl`. The `EmpiricalReads2Map.wdl` is composed by the `EmpiricalSNPCalling.wdl` that performs the SNP calling, and the `EmpiricalMaps.wdl` that performs the genotype calling and map building in empirical reads. 
The `SimulatedReads2Map.wdl` simulates Illumina reads for RADseq, exome, or WGS data and performs the SNP and genotype calling and genetic map building. By now, [GATK](https://github.com/broadinstitute/gatk), [Freebayes](https://github.com/ekg/freebayes) are included for SNP calling; [updog](https://github.com/dcgerard/updog), [polyRAD](https://github.com/lvclark/polyRAD), [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) for dosage calling; and [OneMap](https://github.com/augusto-garcia/onemap), and [GUSMap](https://github.com/tpbilton/GUSMap) for linkage map building. ![math_meth2](https://user-images.githubusercontent.com/7572527/203172239-e4d2d857-84e2-48c5-bb88-01052a287004.png) ## How to use Multiple systems are available to run WDL workflows such as Cromwell, miniWDL, and dxWDL. See further information in the [openwdl documentation](https://github.com/openwdl/wdl#execution-engines). To run a pipeline, first navigate to [Reads2Map releases page](https://github.com/Cristianetaniguti/Reads2Map/releases), search for the pipeline tag you wish to run, and download the pipeline’s assets (the WDL workflow, the JSON, and the ZIP with accompanying dependencies). ## Documentation Check the description of the inputs for the pipelines: * [EmpiricalReads2Map (EmpiricalSNPCalling and EmpiricalMaps)](https://cristianetaniguti.github.io/Tutorials/Reads2Map/EmpiricalReads.html) * [SimulatedReads2Map](https://cristianetaniguti.github.io/Tutorials/Reads2Map/simulatedreads.html) Check how to evaluate the workflows results in Reads2MapApp Shiny: * [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) Once you selected the best pipeline using a subset of your data, you can build a complete high-density linkage map: * [A Guide to Build High-Density Linkage Maps](https://cristianetaniguti.github.io/Tutorials/onemap/Quick_HighDens/High_density_maps.html) Check more information and examples of usage in: * [Taniguti, C. H., Taniguti, L. 
M., Amadeu, R. R., Mollinari, M., Da, G., Pereira, S., Riera-Lizarazu, O., Lau, J., Byrne, D., de Siqueira Gesteira, G., De, T., Oliveira, P., Ferreira, G. C., & Franco Garcia, A. A. Developing best practices for genotyping-by-sequencing analysis using linkage maps as benchmarks. BioRxiv. https://doi.org/10.1101/2022.11.24.517847](https://www.biorxiv.org/content/10.1101/2022.11.24.517847v2) ## Third-party software and images - [BWA](https://github.com/lh3/bwa) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Used to align simulated reads to reference; - [cutadapt](https://github.com/marcelm/cutadapt) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Trim simulated reads; - [ddRADseqTools](https://github.com/GGFHF/ddRADseqTools) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): Set of applications useful to in silico design and testing of double digest RADseq (ddRADseq) experiments; - [Freebayes](https://github.com/ekg/freebayes) in [Cristaniguti/freebayes:0.0.1](): Variant call step; - [GATK](https://github.com/broadinstitute/gatk) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Variant call step using Haplotype Caller, GenomicsDBImport and GenotypeGVCFs; - [PedigreeSim](https://github.com/PBR/pedigreeSim?files=1) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Simulates progeny genotypes from parents genotypes for different types of populations; - [picard](https://github.com/broadinstitute/picard) in 
[us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files; - [pirs](https://github.com/galaxy001/pirs) in [cristaniguti/ pirs-ddrad-cutadapt:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/pirs-ddrad-cutadapt): To generate simulates paired-end reads from a reference genome; - [samtools](https://github.com/samtools/samtools) in [us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.5.7-2021-06-09_16-47-48Z](https://console.cloud.google.com/gcr/images/broad-gotc-prod/US/genomes-in-the-cloud): Process alignment files; - [SimuSCoP](https://github.com/qasimyu/simuscop) in [cristaniguti/simuscopr:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/simuscopr): Exome and WGS Illumina reads simulations; - [RADinitio](http://catchenlab.life.illinois.edu/radinitio/) in [ cristaniguti/radinitio:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/radinitio): RADseq Illumina reads simulation; - [SuperMASSA](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0030906) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Efficient Exact Maximum a Posteriori Computation for Bayesian SNP Genotyping in Polyploids; - [bcftools](https://github.com/samtools/bcftools) in [lifebitai/bcftools:1.10.2](https://hub.docker.com/r/lifebitai/bcftools): utilities for variant calling and manipulating VCFs and BCFs; - [vcftools](http://vcftools.sourceforge.net/) in [cristaniguti/split_markers:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/split_markers): program package designed for working with VCF files. - [MCHap](https://github.com/PlantandFoodResearch/MCHap) in [cristaniguti/mchap:0.7.0](https://hub.docker.com/repository/docker/cristaniguti/mchap): Polyploid micro-haplotype assembly using Markov chain Monte Carlo simulation. 
### R packages - [OneMap](https://github.com/augusto-garcia/onemap) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Is a software for constructing genetic maps in experimental crosses: full-sib, RILs, F2 and backcrosses; - [Reads2MapTools](https://github.com/Cristianetaniguti/Reads2MapTools) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Support package to perform mapping populations simulations and genotyping for OneMap genetic map building - [GUSMap](https://github.com/tpbilton/GUSMap): Genotyping Uncertainty with Sequencing data and linkage MAPping - [updog](https://github.com/dcgerard/updog) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Flexible Genotyping of Polyploids using Next Generation Sequencing Data - [polyRAD](https://github.com/lvclark/polyRAD) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Genotype Calling with Uncertainty from Sequencing Data in Polyploids - [Reads2MapApp](https://github.com/Cristianetaniguti/Reads2MapApp) in [cristaniguti/reads2mapApp:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Shiny app to evaluate Reads2Map workflows results - [simuscopR](https://github.com/Cristianetaniguti/simuscopR) in [cristaniguti/reads2map:0.0.1](https://hub.docker.com/repository/docker/cristaniguti/reads2map): Wrap-up R package for SimusCop simulations.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.410.1" ; ns1:keywords "linkage_map, variant_calling, WDL, reads_simulation" ; ns1:license ; ns1:name "SimulatedReads2Map" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2022-12-06T19:17:17Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:04:54Z"^^ns1:Date ; ns1:description """# RNA-seq Scientific Workflow Workflow for RNA sequencing using the Parallel Scripting Library - Parsl. **Reference:** Cruz, L., Coelho, M., Terra, R., Carvalho, D., Gadelha, L., Osthoff, C., & Ocaña, K. (2021). *Workflows* Científicos de RNA-Seq em Ambientes Distribuídos de Alto Desempenho: Otimização de Desempenho e Análises de Dados de Expressão Diferencial de Genes. In *Anais do XV Brazilian e-Science Workshop*, p. 57-64. Porto Alegre: SBC. DOI: https://doi.org/10.5753/bresci.2021.15789 ## Requirements In order to use RNA-seq Workflow the following tools must be available: - [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) You can install Bowtie2 by running: > bowtie2-2.3.5.1-linux-x86_64.zip Or > sudo yum install bowtie2-2.3.5-linux-x86_64 - [Samtools](http://www.htslib.org/) Samtools is a suite of programs for interacting with high-throughput sequencing data. - [Picard](https://github.com/broadinstitute/picard) Picard is a set of Java command line tools for manipulating high-throughput sequencing (HTS) data and formats. - [HTSeq](https://htseq.readthedocs.io/en/master/) HTSeq is a native Python library that follows conventions of many Python packages. You can install it by running: > pip install HTSeq HTSeq uses [NumPy](https://numpy.org/), [Pysam](https://github.com/pysam-developers/pysam) and [matplotlib](https://matplotlib.org/). Be sure these tools are installed. - [R](https://www.r-project.org/) To use [DESEq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) script make sure R language is also installed. 
You can install it by running: > sudo apt install r-base - [Parsl - Parallel Scripting Library](https://parsl.readthedocs.io/en/stable/index.html) The recommended way to install Parsl is the suggest approach from Parsl's documentation: > python3 -m pip install parsl - [Python (version >= 3.5)](https://www.python.org/) To use Parsl, you need Python 3.5 or above. You also need Python to use HTSeq, so you should load only one Python version. ## Workflow invocation First of all, make a Comma Separated Values (CSV) file. So, onto the first line type: ``sampleName,fileName,condition``. **Remember, there must be no spaces between items**. You can use the file *"table.csv"* in this repository as an example. Your CSV file will be like this: | sampleName | fileName |condition| |------------------|------------------|---------| | tissue control 1 | SRR5445794.merge.count | control | | tissue control 2 | SRR5445795.merge.count | control | | tissue control 3 | SRR5445796.merge.count | control | | tissue wntup 1 | SRR5445797.merge.count | wntup | | tissue wntup 2 | SRR5445798.merge.count | wntup | | tissue wntup 3 | SRR5445799.merge.count | wntup | The list of command line arguments passed to Python script, beyond the script's name, must be: 1. The indexed genome; 2. The number of threads for bowtie task, sort task, number of splitted files for split_picard task and number of CPU running in htseq task; 3. Path to read fastaq file, which is the path of the input files; 4. Directory's name where the output files must be placed; 5. GTF file; 7. and, lastly the DESeq script. Make sure all the files necessary to run the workflow are in the same directory and the fastaq files in a dedicated folder, as a input directory. 
The command line will be like this: > python3 rna-seq.py ../mm9/mm9 24 ../inputs/ ../outputs ../Mus_musculus.NCBIM37.67.gtf ../DESeq.R **Remember to adjust the parameter multithreaded and multicore according with your computational environment.** Example: If your machine has 8 cores, you should set the parameter on 8. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.411.1" ; ns1:keywords "" ; ns1:license ; ns1:name "ParslRNA-seq Scientific Workflow" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "KO Fastq Files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "WT Fastq Files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2022-12-06T19:22:16Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:05:45Z"^^ns1:Date ; ns1:description """RNAseq workflow UMG: Here we introduce a scientific workflow implementing several open-source software executed by Galaxy parallel scripting language in an high-performance computing environment. We have applied the workflow to a single-cardiomyocyte RNA-seq data retrieved from Gene Expression Omnibus database. The workflow allows for the analysis (alignment, QC, sort and count reads, statistics generation) of raw RNA-seq data and seamless integration of differential expression results into a configurable script code. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.412.1" ; ns1:input , , ; ns1:keywords "" ; ns1:license ; ns1:name "RNAseq_UMG_SDumont_v1" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2022-12-14T16:03:30Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:04:58Z"^^ns1:Date ; ns1:description """Example workflow which allows the use of Mothra Accepts (e.g.) [these](https://github.com/machine-shop/mothra-data/tree/main/test_images) input files, bundled as a collection.""" ; ns1:keywords "" ; ns1:license ; ns1:name "Mothra" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-01-03T09:09:05Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:04:58Z"^^ns1:Date ; ns1:description """The Regulatory Mendelian Mutation (ReMM) score was created for relevance prediction of non-coding variations (SNVs and small InDels) in the human genome (GRCh37) in terms of Mendelian diseases. This project updates the ReMM score for the genome build GRCh38 and combines GRCh37 and GRCh38 into one workflow. ## Pre-requirements ### Conda We use Conda as software and dependency management tool. Conda installation guidelines can be found here: https://conda.io/projects/conda/en/latest/user-guide/install/index.html ### Additional programs These programs are used during the workflow. They usually need to be compiled, however, the repository already contains the executables or generated files. - [AttributeDB](https://github.com/visze/attributedb) - [Jannovar](https://github.com/charite/jannovar) - [parSMURF](https://github.com/AnacletoLAB/parSMURF) ### Snakemake The workflow is managed by Snakemake - a workflow management system used to create reproducible and scalable data analyses. To install Snakemake as well as all other required packages, you need to create a working environment according to the description in the file env/ReMM.yaml. For that, first clone the repository ``` git clone https://github.com/kircherlab/ReMM cd ReMM ``` Create a working environment and activate it ``` conda env create -n ReMM --file workflow/envs/ReMM.yaml conda activate ReMM ``` All paths are relative to the Snakemake file so you do not need to change any path variables. Additionally, Snakemake creates all missing directories, so no need to create any additional folders either. 
## Workflow The workflow consists of four main parts: - Download of feature data - Data processing and cleaning - Model training and validation - Calculation of ReMM for the whole genome The `workflow` folder contains a graph of the workflow and more detailed information on the most important steps. To launch a snakemake workflow, you need to tell snakemake which file you want to generate. We defined all rules for multiple steps. They can be found here: `workflow/Snakefile`. For example, if you want to generate all feature sets defined in a config file, you can run: ``` snakemake -c1 all_feature_sets ``` To execute any step separately (see `README.md` in the `workflow` folder for details on workflow steps), you need to look up the name of the desired output file in the scripts and call Snakemake with the exact name. Using a flag `-n`, you can initiate a 'dry run': Snakemake will check the consistency of all rules and files and show the number of steps. However, a clean dry run does not necessarily mean that no errors will occur during a normal run. ReMM score is not allele-specific so that you get only one score independent of the variant itself. The workflow from the download of data up to computing the scores may take several days or weeks depending on the computing power and internet connection. ### The config files The main config file can be found in `config/config.yaml`. This config file was used to generate the ReMM score. Here most of the configuration magic happens. There is a second config file `config/features.yaml` where all features are listed (with additional description). Config files are controlled via [json-schema](http://json-schema.org). We also provide a slurm config file for runtimes, memory and number of threads per rule: `config/slurm.yaml`. 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.414.1" ; ns1:keywords "non-coding, pathogenicity score, variant pathogenicity prediction, Snakemake, ReMM, Regulatory Mendelian Mutation score" ; ns1:license ; ns1:name "ReMM score" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "contour_levels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "dec" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "do_cone_search" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "level_threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ra" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "radius" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "t1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "t2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "asciicat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "contours" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "skymap_files" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-01-11T12:22:37Z"^^ns1:Date ; ns1:dateModified "2023-01-16T14:05:00Z"^^ns1:Date ; ns1:description "" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.415.1" ; ns1:image ; ns1:input , , , , , , , ; ns1:isPartOf ; ns1:keywords "astronomy, Gravitational Waves, FAIR workflows" ; ns1:license ; ns1:name "Gravitational Wave source Cone Search" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-01-13T14:30:04Z"^^ns1:Date ; ns1:dateModified "2023-03-14T14:35:27Z"^^ns1:Date ; ns1:description """ # Github: https://github.com/Lcornet/GENERA # BCCM GEN-ERA tools repository Please visit the wiki for tutorials and access to the tools: https://github.com/Lcornet/GENERA/wiki # NEWS Mantis is now installed in a singularity container for the Metabolic workflow (install is no longer necessary). # Information about the GEN-ERA project Please visit https://bccm.belspo.be/content/bccm-collections-genomic-era # Publications 1. ToRQuEMaDA: tool for retrieving queried Eubacteria, metadata and dereplicating assemblies. Léonard, R. R., Leleu, M., Vlierberghe, M. V., Cornet, L., Kerff, F., and Baurain, D. (2021). PeerJ 9, e11348. doi:10.7717/peerj.11348. https://peerj.com/articles/11348/ 2. The taxonomy of the Trichophyton rubrum complex: a phylogenomic approach. Cornet, L., D’hooge, E., Magain, N., Stubbe, D., Packeu, A., Baurain, D., and Becker P. (2021). Microbial Genomics 7, 000707. doi:10.1099/mgen.0.000707. https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000707 3. ORPER: A Workflow for Constrained SSU rRNA Phylogenies. Cornet, L., Ahn, A.-C., Wilmotte, A., and Baurain, D. (2021). Genes 12, 1741. doi:10.3390/genes12111741. https://www.mdpi.com/2073-4425/12/11/1741/html 4. AMAW: automated gene annotation for non-model eukaryotic genomes. Meunier, L., Baurain, D., Cornet, L. (2021) https://www.biorxiv.org/content/10.1101/2021.12.07.471566v1 5. Phylogenomic analyses of Snodgrassella isolates from honeybees and bumblebees reveals taxonomic and functional diversity. Cornet, L., Cleenwerck, I., Praet, J., Leonard, R., Vereecken, N.J., Michez, D., Smagghe, G., Baurain, D., Vandamme, P. (2021) https://www.biorxiv.org/content/10.1101/2021.12.10.472130v1 6. Contamination detection in genomic data: more is not enough. 
Cornet, L & Baurain, D (2022) Genome Biology. 2022;23:60. https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02619-9 7. The GEN-ERA toolbox: unified and reproducible workflows for research in microbial genomics Cornet, L., Durieu, B., Baert, F., D’hooge, E., Colignon, D., Meunier, L., Lupo, V., Cleenwerck I., Daniel, HM., Rigouts, L., Sirjacobs, D., Declerck, D., Vandamme, P., Wilmotte, A., Baurain, D., Becker P (2022). https://www.biorxiv.org/content/10.1101/2022.10.20.513017v1 8. CRitical Assessment of genomic COntamination detection at several Taxonomic ranks (CRACOT) Cornet, L., Lupo, V., Declerck, S., Baurain, D. (2022). https://www.biorxiv.org/content/10.1101/2022.11.14.516442v1 # Copyright and License This software is copyright (c) 2017-2021 by University of Liege / Sciensano / BCCM collection by Luc CORNET This is free software; you can redistribute it and/or modify it. ![BCCM](https://github.com/Lcornet/GENERA/blob/main/images/GENERA-logo.png) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.416.1" ; ns1:keywords "" ; ns1:license ; ns1:name "GEN-ERA toolbox" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Patrick Sorn" . a ns1:Person ; ns1:name "Thomas Bukur" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-01-17T15:06:13Z"^^ns1:Date ; ns1:dateModified "2023-01-17T15:06:13Z"^^ns1:Date ; ns1:description """![CoVigator logo](images/CoVigator_logo_txt_nobg.png "CoVigator logo") # CoVigator pipeline: variant detection pipeline for Sars-CoV-2 [![DOI](https://zenodo.org/badge/374669617.svg)](https://zenodo.org/badge/latestdoi/374669617) [![Run tests](https://github.com/TRON-Bioinformatics/covigator-ngs-pipeline/actions/workflows/automated_tests.yml/badge.svg?branch=master)](https://github.com/TRON-Bioinformatics/covigator-ngs-pipeline/actions/workflows/automated_tests.yml) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.nextflow.io/) [![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT) The Covigator pipeline processes SARS-CoV-2 FASTQ or FASTA files into annotated and normalized analysis ready VCF files. It also classifies samples into lineages using pangolin. The pipeline is implemented in the Nextflow framework (Di Tommaso, 2017), it is a stand-alone pipeline that can be used independently of the CoVigator dashboard and knowledge base. Although it is configured by default for SARS-CoV-2 it can be employed for the analysis of other microbial organisms if the required references are provided. The result of the pipeline is one or more annotated VCFs with the list of SNVs and indels ready for analysis. The results from the CoVigator pipeline populate our CoVigator dashboard [https://covigator.tron-mainz.de](https://covigator.tron-mainz.de) **Table of Contents** 1. [Two pipelines in one](#id1) 2. [Implementation](#id2) 3. [How to run](#id3) 4. [Understanding the output](#id4) 6. [Annotation resources](#id5) 7. [Future work](#id6) 8. 
[Bibliography](#id7) ## Two pipelines in one In CoVigator we analyse samples from two different formats, FASTQ files (e.g.: as provided by the European Nucleotide Archive) and FASTA files containing a consensus assembly. While from the first we get the raw reads, from the second we obtain already assembled genomes. Each of these formats has to be analysed differently. Also, the output data that we can obtain from each of these is different. ![CoVigator pipeline](images/pipeline.drawio.png) ### Pipeline for FASTQ files When FASTQ files are provided the pipeline includes the following steps: - **Trimming**. `fastp` is used to trim reads with default values. This step also includes QC filtering. - **Alignment**. `BWA mem 2` is used for the alignment of single or paired end samples. - **BAM preprocessing**. BAM files are prepared and duplicate reads are marked using GATK and Sambamba tools. - **Primer trimming**. When a BED with primers is provided, these are trimmed from the reads using iVar. This is applicable to the results from all variant callers. - **Coverage analysis**. `samtools coverage` and `samtools depth` are used to compute the horizontal and vertical coverage respectively. - **Variant calling**. Four different variant callers are employed: BCFtools, LoFreq, iVar and GATK. Subsequent processing of resulting VCF files is independent for each caller. - **Variant normalization**. `bcftools norm` is employed to left align indels, trim variant calls and remove variant duplicates. - **Technical annotation**. `VAFator` is employed to add VAF and coverage annotations from the reads pileup. - **Phasing**. Clonal mutations (ie: VAF >= 0.8) occurring in the same amino acid are merged for its correct functional annotation. - **Biological annotation**. `SnpEff` is employed to annotate the variant consequences of variants and `bcftools annotate` is employed to add additional SARS-CoV-2 annotations. - **Lineage determination**. 
`pangolin` is used for this purpose, this runs over the results from each of the variant callers separately. Both single end and paired end FASTQ files are supported. ### Pipeline for FASTA files When a FASTA file is provided with a single assembly sequence the pipeline includes the following steps: - **Variant calling**. A Smith-Waterman global alignment is performed against the reference sequence to call SNVs and indels. Indels longer than 50 bp and at the beginning or end of the assembly sequence are excluded. Any mutation where either reference or assembly contain an N is excluded. - **Variant normalization**. Same as described above. - **Phasing**. mutations occurring in the same amino acid are merged for its correct annotation. - **Biological annotation**. Same as described above. - **Lineage determination**. `pangolin` is used for this purpose. The FASTA file is expected to contain a single assembly sequence. Bear in mind that only clonal variants can be called on the assembly. ### Pipeline for VCF files When a VCF file is provided the pipeline includes the following steps: - **Variant normalization**. Same as described above. - **Technical annotation**. Same as described above (optional if BAM is provided) - **Phasing**. mutations occurring in the same amino acid are merged for its correct annotation. - **Biological annotation**. Same as described above - **Lineage determination**. `pangolin` is used for this purpose. ## Implementation The pipeline is implemented as a Nextflow workflow with its DSL2 syntax. The dependencies are managed through a conda environment to ensure version traceability and reproducibility. The references for SARS-CoV-2 are embedded in the pipeline. The pipeline is based on a number of third-party tools, plus a custom implementation based on biopython (Cock, 2009) for the alignment and subsequent variant calling over a FASTA file. 
All code is open sourced in GitHub [https://github.com/TRON-Bioinformatics/covigator-ngs-pipeline](https://github.com/TRON-Bioinformatics/covigator-ngs-pipeline) and made available under the MIT license. We welcome any contribution. If you have trouble using the CoVigator pipeline or you find an issue, we will be thankful if you would report a ticket in GitHub. The alignment, BAM preprocessing and variant normalization pipelines are based on the implementations in additional Nextflow pipelines within the TronFlow initiative [https://tronflow-docs.readthedocs.io/](https://tronflow-docs.readthedocs.io/). ### Variant annotations The variants derived from a FASTQ file are annotated on the `FILTER` column using the VAFator (https://github.com/TRON-Bioinformatics/vafator) variant allele frequency (VAF) into `LOW_FREQUENCY`, `SUBCLONAL`, `LOW_QUALITY_CLONAL` and finally `PASS` variants correspond to clonal variants. By default, variants with a VAF < 2 % are considered `LOW_FREQUENCY`, variants with a VAF >= 2 % and < 50 % are considered `SUBCLONAL` and variants with a VAF >= 50 % and < 80 % are considered `LOW_QUALITY_CLONAL`. These thresholds can be changed with the parameters `--low_frequency_variant_threshold`, `--subclonal_variant_threshold` and `--low_quality_clonal_variant_threshold` respectively. VAFator technical annotations: - `INFO/vafator_af`: variant allele frequency of the mutation - `INFO/vafator_ac`: number of reads supporting the mutation - `INFO/vafator_dp`: total number of reads at the position, in the case of indels this represents the number of reads in the previous position SnpEff provides the functional annotations. And all mutations are additionally annotated with the following SARS-CoV-2 specific annotations: - ConsHMM conservation scores as reported in (Kwon, 2021) - Pfam domains as reported in Ensembl annotations. Biological annotations: - `INFO/ANN` are the SnpEff consequence annotations (eg: overlapping gene, effect of the mutation). 
This are described in detail here [http://pcingola.github.io/SnpEff/se_inputoutput/](http://pcingola.github.io/SnpEff/se_inputoutput/) - `INFO/CONS_HMM_SARS_COV_2` is the ConsHMM conservation score in SARS-CoV-2 - `INFO/CONS_HMM_SARBECOVIRUS` is the ConsHMM conservation score among Sarbecovirus - `INFO/CONS_HMM_VERTEBRATE_COV` is the ConsHMM conservation score among vertebrate Corona virus - `INFO/PFAM_NAME` is the Interpro name for the overlapping Pfam domains - `INFO/PFAM_DESCRIPTION` is the Interpro description for the overlapping Pfam domains - `INFO/problematic` contains the filter provided in DeMaio et al. (2020) for problematic mutations According to DeMaio et al. (2020), mutations at the beginning (ie: POS <= 50) and end (ie: POS >= 29,804) of the genome are filtered out This is an example of biological annotations of a missense mutation in the spike protein on the N-terminal subunit 1 domain. ``` ANN=A|missense_variant|MODERATE|S|gene-GU280_gp02|transcript|TRANSCRIPT_gene-GU280_gp02|protein_coding|1/1|c.118G>A| p.D40N|118/3822|118/3822|40/1273||;CONS_HMM_SARS_COV_2=0.57215;CONS_HMM_SARBECOVIRUS=0.57215;CONS_HMM_VERTEBRATE_COV=0; PFAM_NAME=bCoV_S1_N;PFAM_DESCRIPTION=Betacoronavirus-like spike glycoprotein S1, N-terminal ``` ### Phasing limitations The phasing implementation is applicable only to clonal mutations. It assumes all clonal mutations are in phase and hence it merges those occurring in the same amino acid. In order to phase intrahost mutations we would need to implement a read-backed phasing approach such as in WhatsHap or GATK's ReadBackedPhasing. Unfortunately these tools do not support the scenario of a haploid organism with an undefined number of subclones. For this reason, phasing is implemented with custom Python code at `bin/phasing.py`. ### Primers trimming With some library preparation protocols such as ARTIC it is recommended to trim the primers from the reads. 
We have observed that if primers are not trimmed spurious mutations are being called specially SNVs with lower frequencies and long deletions. Also the variant allele frequencies of clonal mutations are underestimated. The BED files containing the primers for each ARTIC version can be found at https://github.com/artic-network/artic-ncov2019/tree/master/primer_schemes/nCoV-2019. If the adequate BED file is provided to the CoVigator pipeline with `--primers` the primers will be trimmed with iVar. This affects the output of every variant caller, not only iVar. ### Reference data The default SARS-CoV-2 reference files correspond to Sars_cov_2.ASM985889v3 and were downloaded from Ensembl servers. No additional parameter needs to be provided to use the default SARS-CoV-2 reference genome. #### Using a custom reference genome These references can be customised to use a different SARS-CoV-2 reference or to analyse a different virus. Two files need to be provided: - Use a custom reference genome by providing the parameter `--reference your.fasta`. - Gene annotation file in GFFv3 format `--gff your.gff`. This is only required to run iVar Additionally, the FASTA needs bwa indexes, .fai index and a .dict index. These indexes can be generated with the following two commands: ``` bwa index reference.fasta samtools faidx reference.fasta gatk CreateSequenceDictionary --REFERENCE your.fasta ``` **NOTE**: beware that for Nextflow to find these indices the reference needs to be passed as an absolute path. The SARS-CoV-2 specific annotations will be skipped when using a custom genome. 
In order to have SnpEff functional annotations available you will also need to provide three parameters: - `--snpeff_organism`: organism to annotate with SnpEff (ie: as registered in SnpEff) - `--snpeff_data`: path to the SnpEff data folder - `--snpeff_config`: path to the SnpEff config file ### Intrahost mutations Some mutations may be observed in a subset of the virus sample, this may arise through intrahost virus evolution or co-infection. Intrahost mutations can only be detected when analysing the raw reads (ie: the FASTQs) as in the assembly (ie: the FASTA file) a single virus consensus sequence is represented. BCFtools and GATK do not normally capture intrahost mutations; on the other hand LoFreq and iVar both capture mutations that deviate from a clonal-like VAF. Nevertheless, mutations with lower variant allele frequency (VAF) are challenging to distinguish from sequencing and analytical errors. Mutations are annotated on the `FILTER` column using the VAF into three categories: - `LOW_FREQUENCY`: subset of intrahost mutations with lowest frequencies, potentially enriched with false positive calls (VAF < 2 %). - `SUBCLONAL`: subset of intrahost mutations with higher frequencies (2 % <= VAF < 50 %). - `LOW_QUALITY_CLONAL`: subset of clonal mutations with lower frequencies (50 % <= VAF < 80 %). - `PASS` clonal mutations (VAF >= 80 %) Other low quality mutations are removed from the output. The VAF thresholds can be changed with the parameters `--low_frequency_variant_threshold`, `--subclonal_variant_threshold` and `--low_quality_clonal_variant_threshold`. ## How to run ### Requirements - Nextflow >= 19.10.0 - Java >= 8 - Conda >=4.9 ### Testing To run the workflow on a test assembly dataset run: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda,test_fasta ``` Find the output in the folder `covigator_test_fasta`. 
To run the workflow on a test raw reads dataset run: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda,test_fastq ``` Find the output in the folder `covigator_test_fastq`. The above commands are useful to create the conda environments beforehand. **NOTE**: pangolin is the most time-consuming step of the whole pipeline. To make it faster, locate the conda environment that Nextflow created with pangolin (eg: `find $YOUR_NEXTFOW_CONDA_ENVS_FOLDER -name pangolin`) and run `pangolin --decompress-model`. ### Running For paired end reads: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline \\ [-r v0.10.0] \\ [-profile conda] \\ --fastq1 \\ --fastq2 \\ --name example_run \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` For single end reads: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline \\ [-r v0.10.0] \\ [-profile conda] \\ --fastq1 \\ --name example_run \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` For assembly: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline \\ [-r v0.10.0] \\ [-profile conda] \\ --fasta \\ --name example_run \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` For VCF: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline \\ [-r v0.10.0] \\ [-profile conda] \\ --vcf \\ --name example_run \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` As an optional input when processing directly VCF files you can provide BAM files to annotate VAFs: ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline \\ [-r v0.10.0] \\ [-profile conda] \\ --vcf \\ --bam \\ --bai \\ --name example_run \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` For batch processing of reads use `--input_fastqs_list` and `--name`. 
``` nextflow run tron-bioinformatics/covigator-ngs-pipeline [-profile conda] --input_fastqs_list --library --output [--reference /Sars_cov_2.ASM985889v3.fa] [--gff /Sars_cov_2.ASM985889v3.gff3] ``` where the TSV file contains two or three tab-separated columns **without header**. Columns: sample name, path to FASTQ 1 and optionally path to FASTQ 2. | Sample | FASTQ 1 | FASTQ 2 (optional column) | |-----------|-------------------------------|-------------------------------| | sample1 | /path/to/sample1_fastq1.fastq | /path/to/sample1_fastq2.fastq | | sample2 | /path/to/sample2_fastq1.fastq | /path/to/sample2_fastq2.fastq | | ... | ... | ... | For batch processing of assemblies use `--input_fastas_list`. ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline [-profile conda] --input_fastas_list --library --output [--reference /Sars_cov_2.ASM985889v3.fa] [--gff /Sars_cov_2.ASM985889v3.gff3] ``` where the TSV file contains two tab-separated columns **without header**. Columns: sample name and path to FASTA. | Sample | FASTA | |-----------|------------------------| | sample1 | /path/to/sample1.fasta | | sample2 | /path/to/sample2.fasta | | ... | ... | For batch processing of VCFs use `--input_vcfs_list`. ``` nextflow run tron-bioinformatics/covigator-ngs-pipeline [-profile conda] --input_vcfs_list --output [--reference /Sars_cov_2.ASM985889v3.fa] [--gff /Sars_cov_2.ASM985889v3.gff3] ``` where the TSV file contains two tab-separated columns **without header**. Columns: sample name and path to VCF. | Sample | VCF | |-----------|------------------------| | sample1 | /path/to/sample1.vcf | | sample2 | /path/to/sample2.vcf | | ... | ... | Optionally, provide BAM files for batch processing of VCFs using `--input_bams_list`. 
``` nextflow run tron-bioinformatics/covigator-ngs-pipeline [-profile conda] \\ --input_vcfs_list \\ --input_bams_list \\ --output \\ [--reference /Sars_cov_2.ASM985889v3.fa] \\ [--gff /Sars_cov_2.ASM985889v3.gff3] ``` where the BAMs TSV file contains three columns tab-separated columns **without header**. Columns: sample name, path to BAM and path to BAI. | Sample | BAM | BAI | |-----------|----------------------|----------------------| | sample1 | /path/to/sample1.bam | /path/to/sample1.bai | | sample2 | /path/to/sample2.bam | /path/to/sample2.bai | | ... | ... | ... | ### Getting help You can always contact us directly or create a GitHub issue, otherwise see all available options using `--help`: ``` $ nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda --help Usage: nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda --help Input: * --fastq1: the first input FASTQ file (not compatible with --fasta, nor --vcf) * --fasta: the FASTA file containing the assembly sequence (not compatible with --fastq1, nor --vcf) * --vcf: the VCF file containing mutations to analyze (not compatible with --fastq1, nor --fasta) * --bam: the BAM file containing reads to annotate VAFs on a VCF (not compatible with --fastq1, nor --fasta) * --bai: the BAI index for a BAM file (not compatible with --fastq1, nor --fasta) * --name: the sample name, output files will be named after this name * --output: the folder where to publish output * --input_fastqs_list: alternative to --name and --fastq1 for batch processing * --library: required only when using --input_fastqs * --input_fastas_list: alternative to --name and --fasta for batch processing * --input_vcfs_list: alternative to --name and --vcf for batch processing * --input_bams_list: alternative to --name, --vcf, --bam and --bai for batch processing Optional input only required to use a custom reference: * --reference: the reference genome FASTA file, *.fai, *.dict and bwa indexes are required. 
* --gff: the GFFv3 gene annotations file (required to run iVar and to phase mutations from all variant callers) * --snpeff_data: path to the SnpEff data folder, it will be useful to use the pipeline on other viruses than SARS-CoV-2 * --snpeff_config: path to the SnpEff config file, it will be useful to use the pipeline on other viruses than SARS-CoV-2 * --snpeff_organism: organism to annotate with SnpEff, it will be useful to use the pipeline on other viruses than SARS-CoV-2 Optional input: * --fastq2: the second input FASTQ file * --primers: a BED file containing the primers used during library preparation. If provided primers are trimmed from the reads. * --min_base_quality: minimum base call quality to take a base into account for variant calling (default: 20) * --min_mapping_quality: minimum mapping quality to take a read into account for variant calling (default: 20) * --vafator_min_base_quality: minimum base call quality to take a base into account for VAF annotation (default: 0) * --vafator_min_mapping_quality: minimum mapping quality to take a read into account for VAF annotation (default: 0) * --low_frequency_variant_threshold: VAF threshold to mark a variant as low frequency (default: 0.02) * --subclonal_variant_threshold: VAF superior threshold to mark a variant as subclonal (default: 0.5) * --lq_clonal_variant_threshold: VAF superior threshold to mark a variant as low quality clonal (default: 0.8) * --memory: the amount of memory used by each job (default: 3g) * --cpus: the number of CPUs used by each job (default: 1) * --skip_lofreq: skips calling variants with LoFreq * --skip_gatk: skips calling variants with GATK * --skip_bcftools: skips calling variants with BCFTools * --skip_ivar: skips calling variants with iVar * --skip_pangolin: skips lineage determination with pangolin * --match_score: global alignment match score, only applicable for assemblies (default: 2) * --mismatch_score: global alignment mismatch score, only applicable for assemblies 
(default: -1) * --open_gap_score: global alignment open gap score, only applicable for assemblies (default: -3) * --extend_gap_score: global alignment extend gap score, only applicable for assemblies (default: -0.1) * --skip_sarscov2_annotations: skip some of the SARS-CoV-2 specific annotations (default: false) * --keep_intermediate: keep intermediate files (ie: BAM files and intermediate VCF files) * --args_bcftools_mpileup: additional arguments for bcftools mpileup command (eg: --args_bcftools_mpileup='--ignore-overlaps') * --args_bcftools_call: additional arguments for bcftools call command (eg: --args_bcftools_call='--something') * --args_lofreq: additional arguments for lofreq command (eg: --args_lofreq='--something') * --args_gatk: additional arguments for gatk command (eg: --args_gatk='--something') * --args_ivar_samtools: additional arguments for ivar samtools mpileup command (eg: --args_ivar_samtools='--ignore-overlaps') * --args_ivar: additional arguments for ivar command (eg: --args_ivar='--something') Output: * Output a VCF file for each of BCFtools, GATK, LoFreq and iVar when FASTQ files are provided or a single VCF obtained from a global alignment when a FASTA file is provided. * A pangolin results file for each of the VCF files. * Only when FASTQs are provided: * FASTP statistics * Depth and breadth of coverage analysis results ``` ## Understanding the output Although the VCFs are normalized for both pipelines, the FASTQ pipeline runs four variant callers, while the FASTA pipeline runs a single variant caller. Also, there are several metrics in the FASTQ pipeline that are not present in the output of the FASTA pipeline. Here we will describe these outputs. ### FASTQ pipeline output Find in the table below a description of each of the expected files and a link to a sample file for the FASTQ pipeline. The VCF files will be described in more detail later. 
| Name | Description | Sample file | |---------------------------------|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| | $NAME.fastp_stats.json | Output metrics of the fastp trimming process in JSON format | [ERR4145453.fastp_stats.json](_static/covigator_pipeline_sample_output_reads/ERR4145453.fastp_stats.json) | | $NAME.fastp_stats.html | Output metrics of the fastp trimming process in HTML format | [ERR4145453.fastp_stats.html](_static/covigator_pipeline_sample_output_reads/ERR4145453.fastp_stats.html) | | $NAME.deduplication_metrics.txt | Deduplication metrics | [ERR4145453.deduplication_metrics.txt](_static/covigator_pipeline_sample_output_reads/ERR4145453.deduplication_metrics.txt) | | $NAME.coverage.tsv | Coverage metrics (eg: mean depth, % horizontal coverage) | [ERR4145453.coverage.tsv](_static/covigator_pipeline_sample_output_reads/ERR4145453.coverage.tsv) | | $NAME.depth.tsv | Depth of coverage per position | [ERR4145453.depth.tsv](_static/covigator_pipeline_sample_output_reads/ERR4145453.depth.tsv) | | $NAME.bcftools.vcf.gz | Bgzipped, tabix-indexed and annotated output VCF from BCFtools | [ERR4145453.bcftools.normalized.annotated.vcf.gz](_static/covigator_pipeline_sample_output_reads/ERR4145453.bcftools.normalized.annotated.vcf.gz) | | $NAME.gatk.vcf.gz | Bgzipped, tabix-indexed and annotated output VCF from GATK | [ERR4145453.gatk.normalized.annotated.vcf.gz](_static/covigator_pipeline_sample_output_reads/ERR4145453.gatk.normalized.annotated.vcf.gz) | | $NAME.lofreq.vcf.gz | Bgzipped, tabix-indexed and annotated output VCF from LoFreq | [ERR4145453.lofreq.normalized.annotated.vcf.gz](_static/covigator_pipeline_sample_output_reads/ERR4145453.lofreq.normalized.annotated.vcf.gz) | | $NAME.ivar.vcf.gz | Bgzipped, tabix-indexed and annotated output VCF from iVar | 
[ERR4145453.ivar.tsv](_static/covigator_pipeline_sample_output_reads/ERR4145453.ivar.tsv) | | $NAME.lofreq.pangolin.csv | Pangolin CSV output file derived from LoFreq mutations | [ERR4145453.lofreq.pangolin.csv](_static/covigator_pipeline_sample_output_reads/ERR4145453.lofreq.pangolin.csv) | ### FASTA pipeline output The FASTA pipeline returns a single VCF file. The VCF files will be described in more detail later. | Name | Description | Sample file | |-----------------------------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------| | $NAME.assembly.vcf.gz | Bgzipped, tabix-indexed and annotated output VCF | [ERR4145453.assembly.normalized.annotated.vcf.gz](_static/covigator_pipeline_sample_output_assembly/hCoV-19_NTXX.assembly.normalized.annotated.vcf.gz) | ## Annotations resources SARS-CoV-2 ASM985889v3 references were downloaded from Ensembl on 6th of October 2020: - ftp://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/dna/Sars_cov_2.ASM985889v3.dna.toplevel.fa.gz - ftp://ftp.ensemblgenomes.org/pub/viruses/gff3/sars_cov_2/Sars_cov_2.ASM985889v3.101.gff3.gz ConsHMM mutation depletion scores downloaded on 1st of July 2021: - https://github.com/ernstlab/ConsHMM_CoV/blob/master/wuhCor1.mutDepletionConsHMM.bed - https://github.com/ernstlab/ConsHMM_CoV/blob/master/wuhCor1.mutDepletionSarbecovirusConsHMM.bed - https://github.com/ernstlab/ConsHMM_CoV/blob/master/wuhCor1.mutDepletionVertebrateCoVConsHMM.bed Gene annotations including Pfam domains downloaded from Ensembl on 25th of February 2021 from: - ftp://ftp.ensemblgenomes.org/pub/viruses/json/sars_cov_2/sars_cov_2.json ## Future work - Primer trimming on an arbitrary sequencing library. - Pipeline for Oxford Nanopore technology. - Variant calls from assemblies contain an abnormally high number of deletions of size greater than 3 bp. This is a technical artifact that would need to be avoided. 
## Bibliography - Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316–319. https://doi.org/10.1038/nbt.3820 - Vasimuddin Md, Sanchit Misra, Heng Li, Srinivas Aluru. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. - Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. Unified Representation of Genetic Variants. Bioinformatics (2015) 31(13): 2202-2204](http://bioinformatics.oxfordjournals.org/content/31/13/2202) and uses bcftools [Li, H. (2011). A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics (Oxford, England), 27(21), 2987–2993. 10.1093/bioinformatics/btr509 - Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. - Van der Auwera GA, Carneiro M, Hartl C, Poplin R, del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella K, Altshuler D, Gabriel S, DePristo M. (2013). From FastQ Data to High-Confidence Variant Calls: The Genome Analysis Toolkit Best Practices Pipeline. Curr Protoc Bioinformatics, 43:11.10.1-11.10.33. DOI: 10.1002/0471250953.bi1110s43. - Martin, M., Patterson, M., Garg, S., O Fischer, S., Pisanti, N., Klau, G., Schöenhuth, A., & Marschall, T. (2016). WhatsHap: fast and accurate read-based phasing. BioRxiv, 085050. https://doi.org/10.1101/085050 - Danecek, P., & McCarthy, S. A. (2017). BCFtools/csq: haplotype-aware variant consequences. Bioinformatics, 33(13), 2037–2039. https://doi.org/10.1093/bioinformatics/btx100 - Wilm, A., Aw, P. P. K., Bertrand, D., Yeo, G. H. T., Ong, S. 
H., Wong, C. H., Khor, C. C., Petric, R., Hibberd, M. L., & Nagarajan, N. (2012). LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Research, 40(22), 11189–11201. https://doi.org/10.1093/nar/gks918 - Grubaugh, N. D., Gangavarapu, K., Quick, J., Matteson, N. L., De Jesus, J. G., Main, B. J., Tan, A. L., Paul, L. M., Brackney, D. E., Grewal, S., Gurfield, N., Van Rompay, K. K. A., Isern, S., Michael, S. F., Coffey, L. L., Loman, N. J., & Andersen, K. G. (2019). An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology, 20(1), 8. https://doi.org/10.1186/s13059-018-1618-7 - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 - Kwon, S. Bin, & Ernst, J. (2021). Single-nucleotide conservation state annotation of the SARS-CoV-2 genome. Communications Biology, 4(1), 1–11. https://doi.org/10.1038/s42003-021-02231-w - Cock, P. J., Antao, T., Chang, J. T., Chapman, B. A., Cox, C. J., Dalke, A., et al. (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics, 25(11), 1422–1423. - Artem Tarasov, Albert J. Vilella, Edwin Cuppen, Isaac J. Nijman, Pjotr Prins, Sambamba: fast processing of NGS alignment formats, Bioinformatics, Volume 31, Issue 12, 15 June 2015, Pages 2032–2034, https://doi.org/10.1093/bioinformatics/btv098 """ ; ns1:image ; ns1:keywords "Bioinformatics, SARS-CoV-2, covid-19, variant calling, Nextflow" ; ns1:license ; ns1:name "CoVigator pipeline: variant detection pipeline for Sars-CoV-2 (and other viruses...)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-01-17T16:51:42Z"^^ns1:Date ; ns1:dateModified "2023-01-17T16:51:42Z"^^ns1:Date ; ns1:description """# TronFlow alignment pipeline ![GitHub tag (latest SemVer)](https://img.shields.io/github/v/release/tron-bioinformatics/tronflow-bwa?sort=semver) [![Run tests](https://github.com/TRON-Bioinformatics/tronflow-bwa/actions/workflows/automated_tests.yml/badge.svg?branch=master)](https://github.com/TRON-Bioinformatics/tronflow-bwa/actions/workflows/automated_tests.yml) [![DOI](https://zenodo.org/badge/327943420.svg)](https://zenodo.org/badge/latestdoi/327943420) [![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT) [![Powered by Nextflow](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.nextflow.io/) The TronFlow alignment pipeline is part of a collection of computational workflows for tumor-normal pair somatic variant calling. Find the documentation here [![Documentation Status](https://readthedocs.org/projects/tronflow-docs/badge/?version=latest)](https://tronflow-docs.readthedocs.io/en/latest/?badge=latest) This pipeline aligns paired and single end FASTQ files with BWA aln and mem algorithms and with BWA mem 2. For RNA-seq STAR is also supported. To increase sensitivity of novel junctions use `--star_two_pass_mode` (recommended for RNAseq variant calling). It also includes an initial step of read trimming using FASTP. 
## How to run it Run it from GitHub as follows: ``` nextflow run tron-bioinformatics/tronflow-alignment -profile conda --input_files $input --output $output --algorithm aln --library paired ``` Otherwise download the project and run as follows: ``` nextflow main.nf -profile conda --input_files $input --output $output --algorithm aln --library paired ``` Find the help as follows: ``` $ nextflow run tron-bioinformatics/tronflow-alignment --help N E X T F L O W ~ version 19.07.0 Launching `main.nf` [intergalactic_shannon] - revision: e707c77d7b Usage: nextflow main.nf --input_files input_files [--reference reference.fasta] Input: * input_fastq1: the path to a FASTQ file (incompatible with --input_files) * input_files: the path to a tab-separated values file containing in each row the sample name and two paired FASTQs (incompatible with --fastq1 and --fastq2) when `--library paired`, or a single FASTQ file when `--library single` Example input file: name1 fastq1.1 fastq1.2 name2 fastq2.1 fastq2.2 * reference: path to the indexed FASTA genome reference or the star reference folder in case of using star Optional input: * input_fastq2: the path to a second FASTQ file (incompatible with --input_files, incompatible with --library paired) * output: the folder where to publish output (default: output) * algorithm: determines the BWA algorithm, either `aln`, `mem`, `mem2` or `star` (default `aln`) * library: determines whether the sequencing library is paired or single end, either `paired` or `single` (default `paired`) * cpus: determines the number of CPUs for each job, with the exception of bwa sampe and samse steps which are not parallelized (default: 8) * memory: determines the memory required by each job (default: 32g) * inception: if enabled it uses an inception, only valid for BWA aln, it requires a fast file system such as flash (default: false) * skip_trimming: skips the read trimming step * star_two_pass_mode: activates STAR two-pass mode, increasing sensitivity of 
novel junction discovery, recommended for RNA variant calling (default: false) * additional_args: additional alignment arguments, only effective in BWA mem, BWA mem 2 and STAR (default: none) Output: * A BAM file \\${name}.bam and its index * FASTP read trimming stats report in HTML format \\${name.fastp_stats.html} * FASTP read trimming stats report in JSON format \\${name.fastp_stats.json} ``` ### Input tables The table with FASTQ files expects two tab-separated columns without a header | Sample name | FASTQ 1 | FASTQ 2 | |----------------------|---------------------------------|------------------------------| | sample_1 | /path/to/sample_1.1.fastq | /path/to/sample_1.2.fastq | | sample_2 | /path/to/sample_2.1.fastq | /path/to/sample_2.2.fastq | ### Reference genome The reference genome has to be provided in FASTA format and it requires two set of indexes: * FAI index. Create with `samtools faidx your.fasta` * BWA indexes. Create with `bwa index your.fasta` For bwa-mem2 a specific index is needed: ``` bwa-mem2 index your.fasta ``` For star a reference folder prepared with star has to be provided. In order to prepare it will need the reference genome in FASTA format and the gene annotations in GTF format. Run a command as follows: ``` STAR --runMode genomeGenerate --genomeDir $YOUR_FOLDER --genomeFastaFiles $YOUR_FASTA --sjdbGTFfile $YOUR_GTF ``` ## References * Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler Transform. Bioinformatics, Epub. https://doi.org/10.1093/bioinformatics/btp698 * Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 * Vasimuddin Md, Sanchit Misra, Heng Li, Srinivas Aluru. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. 
* Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PMID: 23104886; PMCID: PMC3530905. """ ; ns1:isPartOf ; ns1:keywords "Alignment, BWA, STAR, Bioinformatics, fastp" ; ns1:license ; ns1:name "TronFlow alignment pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-01-17T16:54:14Z"^^ns1:Date ; ns1:dateModified "2023-01-17T16:54:14Z"^^ns1:Date ; ns1:description """# TronFlow BAM preprocessing pipeline ![GitHub tag (latest SemVer)](https://img.shields.io/github/v/release/tron-bioinformatics/tronflow-bam-preprocessing?sort=semver) [![Automated tests](https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing/actions/workflows/automated_tests.yml/badge.svg)](https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing/actions/workflows/automated_tests.yml) [![DOI](https://zenodo.org/badge/358400957.svg)](https://zenodo.org/badge/latestdoi/358400957) [![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT) [![Powered by Nextflow](https://img.shields.io/badge/powered%20by-Nextflow-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://www.nextflow.io/) The TronFlow BAM preprocessing pipeline is part of a collection of computational workflows for tumor-normal pair somatic variant calling. These workflows are implemented in the Nextflow (Di Tommaso, 2017) framework. 
Find the documentation here [![Documentation Status](https://readthedocs.org/projects/tronflow-docs/badge/?version=latest)](https://tronflow-docs.readthedocs.io/en/latest/?badge=latest) The aim of this workflow is to preprocess BAM files based on Picard and GATK (DePristo, 2011) best practices. ## Background In order to have a variant calling ready BAM file there are a number of operations that need to be applied on the BAM. This pipeline depends on the particular variant caller, but there are some common operations. GATK has been providing a well known best practices document on BAM preprocessing, the latest best practices for GATK4 (https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165) does not perform anymore realignment around indels as opposed to best practices for GATK3 (https://software.broadinstitute.org/gatk/documentation/article?id=3238). This pipeline is based on both Picard and GATK. These best practices have been implemented a number of times, see for instance this implementation in Workflow Definition Language https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl. ## Objectives We aim at providing a single implementation of the BAM preprocessing pipeline that can be used across different use cases. For this purpose there are some required steps and some optional steps. The input can be either a tab-separated values file (`--input_files`) where each line corresponds to one input BAM or a single BAM (`--input_bam` and `--input_name`). ## Implementation Steps: * **Clean BAM**. Sets the mapping quality to 0 for all unmapped reads and avoids soft clipping going beyond the reference genome boundaries. Implemented in Picard * **Reorder chromosomes**. Makes the chromosomes in the BAM follow the same order as the reference genome. Implemented in Picard * **Add read groups**. 
GATK requires that some headers are added to the BAM, also we want to flag somehow the normal and tumor BAMs in the header as some callers, such as Mutect2 require it. Implemented in Picard. * **Mark duplicates** (optional). Identify the PCR and the optical duplications and marks those reads. This uses the parallelized version on Spark, it is reported to scale linearly up to 16 CPUs. * **Realignment around indels** (optional). This procedure is important for locus based variant callers, but for any variant caller doing haplotype assembly it is not needed. This is computing intensive as it first finds regions for realignment where there are indication of indels and then it performs a local realignment over those regions. Implemented in GATK3, deprecated in GATK4 * **Base Quality Score Recalibration (BQSR)** (optional). It aims at correcting systematic errors in the sequencer when assigning the base call quality errors, as these scores are used by variant callers it improves variant calling in some situations. Implemented in GATK4 * **Metrics** (optional). A number of metrics are obtained from the BAM file with Picard's CollectMetrics, CollectHsMetrics and samtools' coverage and depth. ![Pipeline](figures/bam_preprocessing2.png) ## How to run it ``` $ nextflow run tron-bioinformatics/tronflow-bam-preprocessing --help N E X T F L O W ~ version 19.07.0 Launching `main.nf` [intergalactic_shannon] - revision: e707c77d7b Usage: main.nf --input_files input_files Input: * --input_bam: the path to a single BAM (this option is not compatible with --input_files) * --input_files: the path to a tab-separated values file containing in each row the sample name, sample type (eg: tumor or normal) and path to the BAM file (this option is not compatible with --input_bam) Sample type will be added to the BAM header @SN sample name The input file does not have header! 
Example input file: name1 tumor tumor.1.bam name1 normal normal.1.bam name2 tumor tumor.2.bam * --reference: path to the FASTA genome reference (indexes expected *.fai, *.dict) Optional input: * --input_name: the name of the sample. Only used when --input_bam is provided (default: normal) * --dbsnp: path to the dbSNP VCF (required to perform BQSR) * --known_indels1: path to a VCF of known indels (optional to perform realignment around indels) * --known_indels2: path to a second VCF of known indels (optional to perform realignment around indels) * --intervals: path to a BED file to collect coverage and HS metrics from (default: None) * --collect_hs_minimum_base_quality: minimum base quality for a base to contribute coverage (default: 20). * --collect_hs_minimum_mapping_quality: minimum mapping quality for a read to contribute coverage (default: 20). * --skip_bqsr: optionally skip BQSR (default: false) * --skip_realignment: optionally skip realignment (default: false) * --skip_deduplication: optionally skip deduplication (default: false) * --remove_duplicates: removes duplicate reads from output BAM instead of flagging them (default: true) * --skip_metrics: optionally skip metrics (default: false) * --output: the folder where to publish output (default: ./output) * --platform: the platform to be added to the BAM header. 
Valid values: [ILLUMINA, SOLID, LS454, HELICOS and PACBIO] (default: ILLUMINA) Computational resources: * --prepare_bam_cpus: (default: 3) * --prepare_bam_memory: (default: 8g) * --mark_duplicates_cpus: (default: 16) * --mark_duplicates_memory: (default: 64g) * --realignment_around_indels_cpus: (default: 2) * --realignment_around_indels_memory: (default: 31g) * --bqsr_cpus: (default: 3) * --bqsr_memory: (default: 4g) * --metrics_cpus: (default: 1) * --metrics_memory: (default: 8g) Output: * Preprocessed and indexed BAMs * Tab-separated values file with the absolute paths to the preprocessed BAMs, preprocessed_bams.txt Optional output: * Recalibration report * Deduplication metrics * Realignment intervals * GATK multiple metrics * HS metrics * Horizontal and vertical coverage metrics ``` ### Input table The table with FASTQ files expects two tab-separated columns **without a header** | Sample name | Sample type | BAM | |----------------------|---------------------------------|------------------------------| | sample_1 | normal | /path/to/sample_1.normal.bam | | sample_1 | tumor | /path/to/sample_1.tumor.bam | | sample_2 | normal | /path/to/sample_2.normal.bam | | sample_2 | tumor | /path/to/sample_2.tumor.bam | The values used in `sample type` are arbitrary. These will be set in the BAM header tag @RG:SM for sample. There may be some downstream constraints, eg: Mutect2 pipeline requires that the sample type between normal and tumor samples of the same pair are not the same. ### References The BAM preprocessing workflow requires the human reference genome (`--reference`) Base Quality Score Recalibration (BQSR) requires dbSNP to avoid extracting error metrics from polymorphic sites (`--dbsnp`) Realignment around indels requires a set of known indels (`--known_indels1` and `--known_indels2`). These resources can be fetched from the GATK bundle https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle. 
Optionally, in order to run Picard's CollectHsMetrics a BED file will need to be provided (`--intervals`). This BED file will also be used for `samtools coverage`. ## Troubleshooting ### Too new Java version for MarkDuplicatesSpark When using Java 11 the cryptic error message `java.lang.IllegalArgumentException: Unsupported class file major version 55` has been observed. This issue is described here and the solution is to use Java 8 https://gatk.broadinstitute.org/hc/en-us/community/posts/360056174592-MarkDuplicatesSpark-crash. ## Bibliography * DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D, Daly M. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet, 43:491-498. DOI: 10.1038/ng.806. * Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316–319. 10.1038/nbt.3820 """ ; ns1:isPartOf ; ns1:keywords "Bioinformatics, GATK4, sambamba" ; ns1:license ; ns1:name "TronFlow BAM preprocessing pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-11-03T22:12:24Z"^^ns1:Date ; ns1:dateModified "2023-07-03T10:15:31Z"^^ns1:Date ; ns1:description """This workflow has been created as part of Demonstrator 6 of the project EOSC-Life (within WP3) and is focused on reusing publicly available RNAi screens to gain insights into the nucleolus biology. The workflow downloads images from the Image Data Resource (IDR), performs object segmentation (of nuclei and nucleoli) and feature extraction of the images and objects identified. 
Tutorial: https://training.galaxyproject.org/training-material/topics/imaging/tutorials/tutorial-CP/tutorial.html""" ; ns1:isBasedOn ; ns1:keywords "CellProfiler, Galaxy, image processing, imaging" ; ns1:license ; ns1:name "Nucleoli segmentation using CellProfiler (EOSC-Life D6)" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer , , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_ens_mean_0_1deg_reg_v20_0e_Paris_daily_csv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ts_cities_csv" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Bin size in bp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Interactions to consider to calculate weights in normalization step" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/No fill-in" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/PE fastq input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Restriction enzyme" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/genome name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/minimum MAPQ" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/region for matrix plotting" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/HiCUP report (html)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/HiCUP report (txt)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/matrix with iced values" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/matrix with raw values" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/plot with pyGenomeTracks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/valid pairs filtered and sorted" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/valid pairs in juicebox format" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/valid pairs in juicebox format MAPQ filtered" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-16T02:01:43Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:02:10Z"^^ns1:Date ; ns1:description "This workflow take as input a collection of paired fastq. It uses HiCUP to go from fastq to validPair file. The pairs are filtered for MAPQ and sorted by cooler to generate a tabix dataset. Cooler is used to generate a balanced cool file to the desired resolution." ; ns1:input , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Hi-C" ; ns1:license ; ns1:name "hic-hicup-cooler/hic-fastq-to-cool-hicup-cooler" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "RMD" ; ns1:name "R markdown" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , ; ns1:dateCreated "2023-01-25T07:16:25Z"^^ns1:Date ; ns1:dateModified "2023-02-28T13:04:32Z"^^ns1:Date ; ns1:description "We present an R script that describes the workflow for analysing honey bee (Apis mellifera) wing shape. It is based on a large dataset of wing images and landmark coordinates available at Zenodo: https://doi.org/10.5281/zenodo.7244070. The dataset can be used as a reference for the identification of unknown samples. As unknown samples, we used data from Nawrocka et al. (2018), available at Zenodo: https://doi.org/10.5281/zenodo.7567336. Among others, the script can be used to identify the geographic origin of unknown samples and therefore assist in the monitoring and conservation of honey bee biodiversity in Europe." 
; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.422.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "Apis-wings-EU: A workflow for morphometric identification of honey bees from Europe" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-06-29T14:00:25Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:43:36Z"^^ns1:Date ; ns1:description "The tutorial for this workflow can be found on [Galaxy Training Network](https://training.galaxyproject.org/training-material/topics/climate/tutorials/climate-101/tutorial.html)" ; ns1:image ; ns1:input , ; ns1:keywords "GTN, Climate" ; ns1:license ; ns1:name "Climate - Climate 101" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "workdir_array" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "final_result" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-01-31T23:40:40Z"^^ns1:Date ; ns1:dateModified "2023-12-18T05:36:07Z"^^ns1:Date ; ns1:description """ GermlineStructuralV-nf is a pipeline for identifying structural variant events in human Illumina short read whole genome sequence data. GermlineStructuralV-nf identifies structural variant and copy number events from BAM files using [Manta](https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md#de-novo-calling), [Smoove](https://github.com/brentp/smoove), and [TIDDIT](https://github.com/SciLifeLab/TIDDIT). 
 Variants are then merged using [SURVIVOR](https://github.com/fritzsedlazeck/SURVIVOR), and annotated by [AnnotSV](https://pubmed.ncbi.nlm.nih.gov/29669011/). The pipeline is written in Nextflow and uses Singularity/Docker to run containerised tools.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.431.1" ; ns1:keywords "Bioinformatics, Annotation, Genomics, Nextflow, rare diseases, variant_calling, structural variants, manta, smoove, tiddit, annotsv, survivor" ; ns1:license ; ns1:name "GermlineStructuralV-nf" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-02-02T13:46:59Z"^^ns1:Date ; ns1:dateModified "2023-02-02T13:50:16Z"^^ns1:Date ; ns1:description """# Snakemake workflow: Reconstructing raw tomography data A Snakemake workflow for tomographically reconstructing raw data using [tomopy](https://tomopy.readthedocs.io/en/stable/). ## Installation First download this repo and navigate to it ```bash git clone https://codebase.helmholtz.cloud/gernha62/reconstructing-raw-tomography-data.git ``` ```bash cd /path/to/repo ``` (Optional) Download the example folder with: ```bash wget -m -np https://doi2.psi.ch/datasets/das/work/p15/p15869/compression/MI04_02/tif ``` Create a virtual environment and install all necessary packages (requires conda): ```bash conda env create --name reconstr_env --file workflow/envs/reconstr.yml ``` Activate the new virtual environment: ```bash conda activate reconstr_env ``` ## Configuration To configure the workflow, adapt the config file found at `config/config.yaml` . 
The config looks as follows: ```yaml number_of_darks: 50 number_of_flats: 100 number_of_projections: 501 rotation_center: 508.77 raw_data: MI04_02: doi2.psi.ch/datasets/das/work/p15/p15869/compression/MI04_02/tif ``` In the config, adjust `number_of_darks`, `number_of_flats`, `number_of_projections` and `rotation_center` to the number of darks, flats, projections and the rotation center of your dataset. The necessary information can usually be found in the .log file of the folder that contains the raw data. `MI04_02: doi2.psi.ch/datasets/das/work/p15/p15869/compression/MI04_02/tif` denotes the path to the example folder used for reconstruction and the keyword `MI04_02` will be used to name the output (e.g. in this case the output folder will be named `recon_dir_MI04_02`). Replace the example path with the path to the dataset you want to reconstruct. Additionally, if you want the name of the output folder to have a different suffix, replace the keyword `MI04_02` with a name you prefer. ## Run the workflow If the .tif files contain a numerical prefix that is not separated from the actual image index, it is best to first rename the files. The files will be renamed to `00001.tif`, `00002.tif` and so on. If the renaming is needed, run: ```bash snakemake --cores 1 'logs/renamefile_MI04_02.log' ``` If you replaced the keyword `MI04_02` in the config file then adjust the command accordingly (e.g. if you replaced the keyword with `Tomo_dataset` then the command should be `snakemake --cores 1 'logs/renamefile_Tomo_dataset.log'`). Before trying to compute the reconstructions, make sure you have enough memory available (ideally more than 60 GB). To compute the reconstructions using one core, use the command: ```bash snakemake --cores 1 ``` If you want to use all available cores instead, use: ```bash snakemake --cores all ``` This creates a folder in `results` with the reconstructed data. 
## Credit The example dataset used in this project (MI04_02 evolving magma, Mattia Pistone, University of Georgia) was taken from: https://doi.psi.ch/detail/10.16907/05a50450-767f-421d-9832-342b57c201af The script used for reconstruction (`scripts/reconstructs_tomo_datasets.py`) was provided by Alain Studer, PSI.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.432.1" ; ns1:keywords "Reconstruction, Tomography" ; ns1:license ; ns1:name "Reconstructing raw tomography data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-02-02T14:42:11Z"^^ns1:Date ; ns1:dateModified "2023-02-02T14:42:11Z"^^ns1:Date ; ns1:description """# 1. About TF-Prioritizer This pipeline gives you a full analysis of nfcore chromatin accessibility peak data (ChIP-Seq, ATAC-Seq or DNAse-Seq) and nfcore RNA-seq count data. It performs DESeq2, TEPIC and DYNAMITE including all preprocessing and postprocessing steps necessary to transform the data. It also gives you plots for deep analysis of the data. The general workflow is sketched in the images below: ## Graphical abstract: ![Graphical abstract](https://raw.githubusercontent.com/biomedbigdata/TF-Prioritizer/master/media/graphicalAbstract.png) ## Technical workflow: ![Technical workflow](https://github.com/biomedbigdata/TF-Prioritizer/raw/master/media/technicalWorkflow.png) # 2. License and Citing TF-Prioritizer is distributed under the [GNU General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html). The Graphical Abstract and the Technical Workflow were created using [biorender.com](https://biorender.com/). # 3. Usage The software can be executed using docker. For the following command, only [python3](https://www.python.org/downloads/), [curl](https://curl.se/download.html) and [docker](https://docs.docker.com/get-docker/) are required. 
Explanations about the configs can be found in the [config readme](https://github.com/biomedbigdata/TF-Prioritizer/blob/master/configTemplates/README.md). ```bash curl -s https://raw.githubusercontent.com/biomedbigdata/TF-Prioritizer/master/docker.py | python3 - -c [config_file] -o [output_dir] -t [threads] ``` Note, that for this approach an internet connection is required. The docker image will be downloaded from [DockerHub](https://hub.docker.com/r/nicotru/tf-prioritizer) on the first execution as well as with every update we release. Furthermore, the wrapper script will be fetched from GitHub with every execution. If curl is not available (for example if you are using windows), or you want to be able to execute the software without an internet connection, you can download the wrapper script from [here](https://raw.githubusercontent.com/biomedbigdata/TF-Prioritizer/pipeJar/docker.py). You can then execute the script using ```bash python3 [script_path] -c [config_file] -o [output_dir] -t [threads] ``` ## If you want to use the pipeline without docker We do not recommend using the pipeline without docker, because the dependencies are very complex, and it is very hard to install them correctly. However, if you want to use the pipeline without docker, you can do so by installing the dependencies manually. The dependencies and their correct installation process can be derived from the [Dockerfile](https://github.com/biomedbigdata/TF-Prioritizer/blob/master/Dockerfile) and the environment scripts which can be found in the [environment directory](https://github.com/biomedbigdata/TF-Prioritizer/tree/master/environment).""" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "TF-Prioritizer" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Other data (fasterq-dump)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pair-end data (fasterq-dump)" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Single-end data (fasterq-dump)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fasterq-dump log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rnaviralSPAdes on input dataset(s): Contigs" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-02-10T10:05:10Z"^^ns1:Date ; ns1:dateModified "2023-02-10T10:05:10Z"^^ns1:Date ; ns1:description "extract 1 Id from SRA and assume it is PE as input to viralRNASpades." ; ns1:keywords "" ; ns1:license ; ns1:name "extract SRA + viralRNAspades (PE)" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-02-15T11:54:54Z"^^ns1:Date ; ns1:dateModified "2023-02-15T12:02:21Z"^^ns1:Date ; ns1:description """# sqtlseeker2-nf [![nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.27.0-blue.svg)](http://nextflow.io) [![CI-checks](https://github.com/guigolab/sqtlseeker2-nf/actions/workflows/ci.yaml/badge.svg)](https://github.com/guigolab/sqtlseeker2-nf/actions/workflows/ci.yaml) A pipeline for splicing quantitative trait loci (sQTL) mapping. The pipeline performs the following analysis steps: * Index the genotype file * Preprocess the transcript expression data * Test for association between splicing ratios and genetic variants in *cis* (nominal pass) * Obtain an empirical P-value for each phenotype (permutation pass, optional) * Control for multiple testing For details on each step, please read [sQTLseekeR2](https://github.com/guigolab/sQTLseekeR2) documentation. The pipeline uses [Nextflow](http://www.nextflow.io) as the execution backend. 
Please check [Nextflow documentation](http://www.nextflow.io/docs/latest/index.html) for more information. ## Requirements - Unix-like operating system (Linux, MacOS, etc.) - Java 8 or later - [Docker](https://www.docker.com/) (v1.10.0 or later) or [Singularity](http://singularity.lbl.gov) (v2.5.0 or later) ## Quickstart (~2 min) 1. Install Nextflow: ``` curl -fsSL get.nextflow.io | bash ``` 2. Make a test run: ``` ./nextflow run guigolab/sqtlseeker2-nf -with-docker ``` **Note**: set `-with-singularity` to use Singularity instead of Docker. ## Pipeline usage Launching the pipeline with the `--help` parameter shows the help message: ``` nextflow run sqtlseeker2-nf --help ``` ``` N E X T F L O W ~ version 0.27.2 Launching `sqtlseeker2.nf` [admiring_lichterman] - revision: 28c86caf1c sqtlseeker2-nf ~ A pipeline for splicing QTL mapping ---------------------------------------------------- Run sQTLseekeR2 on a set of data. Usage: sqtlseeker2-nf [options] Options: --genotype GENOTYPE_FILE the genotype file --trexp EXPRESSION_FILE the transcript expression file --metadata METADATA_FILE the metadata file --genes GENES_FILE the gene location file --dir DIRECTORY the output directory --mode MODE the run mode: nominal or permuted (default: nominal) --win WINDOW the cis window in bp (default: 5000) --covariates COVARIATES include covariates in the model (default: false) --fdr FDR false discovery rate level (default: 0.05) --min_md MIN_MD minimum effect size reported (default: 0.05) --svqtl SVQTLS report svQTLs (default: false) Additional parameters for mode = nominal: --ld LD threshold for LD-based variant clustering (default: 0, no clustering) --kn KN number of genes per batch in nominal pass (default: 10) Additional parameters for mode = permuted: --kp KP number of genes per batch in permuted pass (default: 10) --max_perm MAX_PERM maximum number of permutations (default: 1000) ``` ## Input files and format `sqtlseeker2-nf` takes as input files the following: * **Genotype 
file.** Contains the genotype of each sample, coded as follows: 0 for REF/REF, 1 for REF/ALT, 2 for ALT/ALT, -1 for missing value. The first four columns should be: `chr`, `start`, `end` and `snpId`. This file needs to be sorted by coordinate. * **Transcript expression file.** Contains the expression of each transcript in each sample (e.g. read counts, RPKM, TPM). It is not recommended to use transformed (log, quantile, or any non-linear transformation) expression. Columns `trId` and `geneId`, corresponding to the transcript and gene IDs, are required. * **Metadata file.** Contains the covariate information for each sample. In addition, it defines the groups or conditions for which sQTL mapping will be performed. The first columns should be: `indId`, `sampleId`, `group`, followed by the covariates. This file defines which samples will be tested. * **Gene location file.** Contains the location of each gene. Columns `chr`, `start`, `end` and `geneId` are required. This file defines which genes will be tested. Example [data](data) is available for the test run. ## Pipeline results sQTL mapping results are saved into the folder specified with the `--dir` parameter. By default it is the `result` folder within the current working directory. Output files are organized into subfolders corresponding to the different `groups` specified in the metadata file: ``` result └── groups ├── group1 │   ├── all-tests.nominal.tsv │   ├── all-tests.permuted.tsv │   ├── sqtls-${level}fdr.nominal.tsv │   └── sqtls-${level}fdr.permuted.tsv ├── group2 ... ``` Note: if only a nominal pass was run, files `*.permuted.tsv` will not be present. 
Output files contain the following information: `all-tests.nominal.tsv` * geneId: gene name * snpId: variant name * F: test statistic * nb.groups: number of genotype groups * md: maximum difference in relative expression between genotype groups (sQTL effect size) * tr.first/tr.second: the transcript IDs of the two transcripts that change the most, in opposite directions * info: number of individuals in each genotype group, including missing values (-1,0,1,2) * pv: nominal P-value if `--svqtl true` * F.svQTL: svQTL test statistic * nb.perms.svQTL: number of permutations for svQTL test * pv.svQTL: svQTL nominal P-value if `--ld ${r2}` * LD: other variants in linkage disequilibrium with snpId above a given r2 threshold > 0 `sqtls-${level}fdr.nominal.tsv` (in addition to the previous) * fdr: false discovery rate (computed across all nominal tests) * fdr.svQTL: svQTL FDR `all-tests.permuted.tsv` * geneId: gene name * variants.cis: number of variants tested in *cis* * LD: median linkage disequilibrium in the region (r2) * best.snp: ID of the top variant * best.nominal.pv: P-value of the top variant * shape1: first parameter value of the fitted beta distribution * shape2: second parameter value of the fitted beta distribution (effective number of independent tests in the region) * nb.perm: number of permutations * pv.emp.perm: empirical P-value, computed based on permutations * pv.emp.beta: empirical P-value, computed based on the fitted beta distribution * runtime: run time in minutes `sqtls-${level}fdr.nominal.tsv` (in addition to the previous) * fdr: false discovery rate (computed across empirical P-values) * p_tn: gene-level threshold for nominal P-values ## Cite sqtlseeker2-nf If you find `sqtlseeker2-nf` useful in your research please cite the related publication: Garrido-Martín, D., Borsari, B., Calvo, M., Reverter, F., Guigó, R. Identification and analysis of splicing quantitative trait loci across multiple tissues in the human genome. *Nat Commun* 12, 727 (2021). 
[https://doi.org/10.1038/s41467-020-20578-2](https://doi.org/10.1038/s41467-020-20578-2) """ ; ns1:keywords "QTL mapping, rna-seq, SNPs, Nextflow, Alternative splicing" ; ns1:license ; ns1:name "sqtlseeker2-nf" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-02-15T11:58:32Z"^^ns1:Date ; ns1:dateModified "2023-02-15T12:09:57Z"^^ns1:Date ; ns1:description """# mvgwas-nf [![nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.1-blue.svg)](http://nextflow.io) [![CI-checks](https://github.com/guigolab/sqtlseeker2-nf/actions/workflows/ci.yaml/badge.svg)](https://github.com/guigolab/sqtlseeker2-nf/actions/workflows/ci.yaml) A pipeline for multi-trait genome-wide association studies (GWAS) using [MANTA](https://github.com/dgarrimar/manta). The pipeline performs the following analysis steps: * Split genotype file * Preprocess phenotype and covariate data * Test for association between phenotypes and genetic variants * Collect summary statistics The pipeline uses [Nextflow](http://www.nextflow.io) as the execution backend. Please check [Nextflow documentation](http://www.nextflow.io/docs/latest/index.html) for more information. ## Requirements - Unix-like operating system (Linux, MacOS, etc.) - Java 8 or later - [Docker](https://www.docker.com/) (v1.10.0 or later) or [Singularity](http://singularity.lbl.gov) (v2.5.0 or later) ## Quickstart (~2 min) 1. Install Nextflow: ``` curl -fsSL get.nextflow.io | bash ``` 2. Make a test run: ``` nextflow run dgarrimar/mvgwas-nf -with-docker ``` **Notes**: move the `nextflow` executable to a directory in your `$PATH`. Set `-with-singularity` to use Singularity instead of Docker. 
(*) Alternatively you can clone this repository: ``` git clone https://github.com/dgarrimar/mvgwas-nf cd mvgwas-nf nextflow run mvgwas.nf -with-docker ``` ## Pipeline usage Launching the pipeline with the `--help` parameter shows the help message: ``` nextflow run mvgwas.nf --help ``` ``` N E X T F L O W ~ version 20.04.1 Launching `mvgwas.nf` [amazing_roentgen] - revision: 56125073b7 mvgwas-nf: A pipeline for multivariate Genome-Wide Association Studies ============================================================================================== Performs multi-trait GWAS using using MANTA (https://github.com/dgarrimar/manta) Usage: nextflow run mvgwas.nf [options] Parameters: --pheno PHENOTYPES phenotype file --geno GENOTYPES indexed genotype VCF file --cov COVARIATES covariate file --l VARIANTS/CHUNK variants tested per chunk (default: 10000) --t TRANSFOMATION phenotype transformation: none, sqrt, log (default: none) --i INTERACTION test for interaction with a covariate: none, (default: none) --ng INDIVIDUALS/GENOTYPE minimum number of individuals per genotype group (default: 10) --dir DIRECTORY output directory (default: result) --out OUTPUT output file (default: mvgwas.tsv) ``` ## Input files and format `mvgwas-nf` requires the following input files: * **Genotypes.** [bgzip](http://www.htslib.org/doc/bgzip.html)-compressed and indexed [VCF](https://samtools.github.io/hts-specs/VCFv4.3.pdf) genotype file. * **Phenotypes.** Tab-separated file with phenotype measurements (quantitative) for each sample (i.e. *n* samples x *q* phenotypes). The first column should contain sample IDs. Columns should be named. * **Covariates.** Tab-separated file with covariate measurements (quantitative or categorical) for each sample (i.e. *n* samples x *k* covariates). The first column should contain sample IDs. Columns should be named. Example [data](data) is available for the test run. 
## Pipeline results An output text file containing the multi-trait GWAS summary statistics (default: `./result/mvgwas.tsv`), with the following information: * `CHR`: chromosome * `POS`: position * `ID`: variant ID * `REF`: reference allele * `ALT`: alternative allele * `F`: pseudo-F statistic * `R2`: fraction of variance explained by the variant * `P`: P-value The output folder and file names can be modified with the `--dir` and `--out` parameters, respectively. ## Cite mvgwas-nf If you find `mvgwas-nf` useful in your research please cite the related publication: Garrido-Martín, D., Calvo, M., Reverter, F., Guigó, R. A fast non-parametric test of association for multiple traits. *bioRxiv* (2022). [https://doi.org/10.1101/2022.06.06.493041](https://doi.org/10.1101/2022.06.06.493041) """ ; ns1:keywords "GWAS, Multivariate, Non-parametric, Nextflow" ; ns1:license ; ns1:name "mvgwas-nf" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ang Guo" . a ns1:Person ; ns1:name "Qian Luo" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-02-16T00:34:18Z"^^ns1:Date ; ns1:dateModified "2023-03-08T23:57:45Z"^^ns1:Date ; ns1:description """# ROIforMSI Source codes for manuscript "Delineating Regions-of-interest for Mass Spectrometry Imaging by Multimodally Corroborated Spatial Segmentation" "ExampleWorkflow.ipynb" is a methods document to demonstrate the workflow of our multimodal fusion-based spatial segmentation. "Utilities.py" contains all the tools to implement our method. "gui.py" and "registration_gui.py" are files to implement linear and nonlinear registration. 
(Licence: GPL-3)""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.437.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "Delineating Regions-of-interest for Mass Spectrometry Imaging by Multimodally Corroborated Spatial Segmentation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-02-20T15:48:34Z"^^ns1:Date ; ns1:dateModified "2023-02-20T15:48:34Z"^^ns1:Date ; ns1:description """# MoP2- DSL2 version of Master of Pores [![Docker Build Status](https://img.shields.io/docker/automated/biocorecrg/nanopore.svg)](https://cloud.docker.com/u/biocorecrg/repository/docker/biocorecrg/nanopore/builds) [![mop2-CI](https://github.com/biocorecrg/MoP2/actions/workflows/build.yml/badge.svg)](https://github.com/biocorecrg/MoP2/actions/workflows/build.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Nextflow version](https://img.shields.io/badge/Nextflow-21.04.1-brightgreen)](https://www.nextflow.io/) [![Nextflow DSL2](https://img.shields.io/badge/Nextflow-DSL2-brightgreen)](https://www.nextflow.io/) [![Singularity version](https://img.shields.io/badge/Singularity-v3.2.1-green.svg)](https://www.sylabs.io/) [![Docker version](https://img.shields.io/badge/Docker-v20.10.8-blue)](https://www.docker.com/)
![MOP2](https://github.com/biocorecrg/MoP2/blob/main/img/master_red.jpg?raw=true) Inspired by Metallica's [Master Of Puppets](https://www.youtube.com/watch?v=S7blkui3nQc) ## Install Please install nextflow and singularity or docker before. Then download the repo: ``` git clone --depth 1 --recurse-submodules git@github.com:biocorecrg/MOP2.git ``` You can use INSTALL.sh to download the version 3.4.5 of guppy or you can replace it with the version you prefer. Please consider that the support of VBZ compression of fast5 started with version 3.4.X. ``` cd MoP2; sh INSTALL.sh ``` ## Testing You can replace ```-with-singularity``` with ```-with-docker``` if you want to use the docker engine. ``` cd mop_preprocess nextflow run mop_preprocess.nf -with-singularity -bg -profile local > log ``` ## Reference If you use this tool, please cite our papers: ["Nanopore Direct RNA Sequencing Data Processing and Analysis Using MasterOfPores" Cozzuto L, Delgado-Tejedor A, Hermoso Pulido T, Novoa EM, Ponomarenko J. *N. Methods Mol Biol. 2023*;2624:185-205. doi: 10.1007/978-1-0716-2962-8_13.](https://link.springer.com/protocol/10.1007/978-1-0716-2962-8_13) ["MasterOfPores: A Workflow for the Analysis of Oxford Nanopore Direct RNA Sequencing Datasets" Luca Cozzuto, Huanle Liu, Leszek P. Pryszcz, Toni Hermoso Pulido, Anna Delgado-Tejedor, Julia Ponomarenko, Eva Maria Novoa. *Front. Genet., 17 March 2020.* https://doi.org/10.3389/fgene.2020.00211](https://www.frontiersin.org/articles/10.3389/fgene.2020.00211/full) ## Documentation The documentation is available at [https://biocorecrg.github.io/MOP2/docs/](https://biocorecrg.github.io/MOP2/docs/about.html) """ ; ns1:keywords "nanopore, ONT, dRNAseq, Transcriptomics, metatranscriptomics" ; ns1:license ; ns1:name "Master of Pores 2" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Allele frequency to call SNV" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Allele frequency to call indel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum quality score to call base" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "PE Reads Pool1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "PE Reads Pool2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Primer Scheme" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference FASTA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mapping of Pool1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_consensus_multifasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool1_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool1_json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool1_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool2_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool2_json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastp_pool2_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_mapping_pool1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filtered_mapping_pool2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input dataset(s) (sorted)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ivar_consensus_out" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapping_merged" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapping_pool2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapping_stats_pool1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mapping_stats_pool2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "masked_ref_pool1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "masked_ref_pool2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_pool1_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_pool2_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc_sample_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "per_sample_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pool1_primers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pool1_quality_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pool2_primers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "pool2_quality_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qualimap_merged_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qualimap_pool1_raw" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "quality_by_sample_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmed_merged_mapping" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-10-11T02:01:48Z"^^ns1:Date ; ns1:dateModified "2025-11-27T14:10:00Z"^^ns1:Date ; ns1:description "A workflow for the analysis of pox virus genomes sequenced as half-genomes (for ITR resolution) in a tiled-amplicon approach" ; ns1:input , , , , , , ; ns1:isBasedOn ; ns1:keywords "pox, Virology" ; ns1:license ; ns1:name "pox-virus-amplicon/main" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-07-22T10:49:00Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:43:41Z"^^ns1:Date ; ns1:description """CWL workflow for NMR spectra Peak Picking The workflow takes as input a series of 2D 1H 15N HSQC NMR spectra and uses nmrpipe tools to convert the spectra in nmrpipe format and performs an automatic peak picking. 
This test uses a protein MDM2 with different ligands and peptide and generates a peak list with 1H and 15N chemical shift values for each spectrum. The difference among these peak lists can be used to characterize the ligand binding site on the protein.""" ; ns1:image ; ns1:input ; ns1:keywords "" ; ns1:license ; ns1:name "NMR pipe" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Harshil Patel" . a ns1:Person ; ns1:name "Phil Ewels" . a ns1:Person ; ns1:name "Rickard Hammarén" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-03-21T05:17:30Z"^^ns1:Date ; ns1:dateModified "2023-03-21T05:17:30Z"^^ns1:Date ; ns1:description """# IGVreport-nf - [Description](#description) - [Diagram](#diagram) - [User guide](#user-guide) - [Workflow summaries](#workflow-summaries) - [Metadata](#metadata) - [Component tools](#component-tools) - [Required (minimum) inputs/parameters](#required-minimum-inputsparameters) - [Additional notes](#additional-notes) - [Help/FAQ/Troubleshooting](#helpfaqtroubleshooting) - [Acknowledgements/citations/credits](#acknowledgementscitationscredits) ## Description Quickly generate [IGV `.html` reports](https://github.com/igvteam/igv-reports) for a genomic region of interest in the human genome (hg38). Bcftools is used to subset a VCF to a region of interest, the subset VCF is then passed to IGV-reports, which generates a report consisting of a table of genomic sites or regions and associated IGV views for each site. The reports can be opened by any web browser as a static page. 
### Diagram ```mermaid graph LR; VCF-->|bcftools view|SubsetVCF; SubsetVCF-->|IGVtools|HTMLreport; AlignmentBAM-->|IGVtools|HTMLreport; ``` ### User guide This workflow uses containers for all steps and can run using Singularity or Docker. It requires Nextflow and either Singularity or Docker be installed. For instructions on installing Nextflow, see their [documentation](https://www.nextflow.io/docs/latest/getstarted.html). **This workflow currently only generates reports for the human reference genome assembly, Hg38.** The workflow runs three processes: 1. The provided VCF file is subset to a region of interest using Bcftools view 2. The Subset VCF file is then indexed using Bcftools index 3. The subset VCF and provided Bam file are used to generate the html report for the region of interest. To start clone this repository: ``` git clone https://github.com/Sydney-Informatics-Hub/IGVreport-nf.git ``` From the IGVreport-nf directory, run the pipeline: ``` nextflow run main.nf --sample \\ --bam \\ --vcf \\ --chr --start --stop ``` This will create a report in a directory titled `./Report`. You can rename this directory at runtime using the flag `--outDir`. All runtime summary reports will be available in the `./runInfo` directory. 
### Workflow summaries #### Metadata |metadata field | workflow_name / workflow_version | |-------------------|:---------------------------------:| |Version | 1.0 | |Maturity | under development | |Creators | Georgie Samaha | |Source | NA | |License | GPL-3.0 license | |Workflow manager | NextFlow | |Container | None | |Install method | NA | |GitHub | github.com/Sydney-Informatics-Hub/IGVreport-nf | |bio.tools | NA | |BioContainers | NA | |bioconda | NA | #### Component tools * nextflow>=20.07.1 * singularity or docker * bcftools/1.16 * igv-reports/1.6.1 #### Required (minimum) inputs/parameters * An indexed alignment file in Bam format * A gzipped and indexed vcf file ## Additional notes ## Help/FAQ/troubleshooting ## Acknowledgements/citations/credits This workflow was developed by the Sydney Informatics Hub, a Core Research Facility of the University of Sydney and the Australian BioCommons which is enabled by NCRIS via Bioplatforms Australia. """ ; ns1:keywords "Alignment, Genomics, variant calling, mapping" ; ns1:license ; ns1:name "IGVreport-nf" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-03-21T05:30:15Z"^^ns1:Date ; ns1:dateModified "2023-03-21T05:30:15Z"^^ns1:Date ; ns1:description """# SNP-Calling GATK Variant calling pipeline for genomic data using Nextflow [![nextflow](https://img.shields.io/badge/nextflow-%E2%89%A522.04.5-brightgreen.svg)](http://nextflow.io) ## Quickstart Install Nextflow using the following command: curl -s https://get.nextflow.io | bash Index reference genome: `$ bwa index /path/to/reference/genome.fa` `$ samtools faidx /path/to/reference/genome.fa` `$ gatk CreateSequenceDictionary -R /path/to/genome.fa -O genome.dict` Launch the pipeline execution with the following command: nextflow run jdetras/snp-calling -r main -profile docker ## Pipeline Description The variant calling pipeline follows the recommended practices from GATK. The input genomic data are aligned to a reference genome using BWA. The alignemnt files are processed using Picard Tools. Variant calling is done using samtools and GATK. ## Input files The input files required to run the pipeline: * Genomic sequence paired reads, `*_{1,2}.fq.gz` * Reference genome, `*.fa` ## Pipeline parameters ### Usage Usage: `nextflow run jdetras/snp-calling -profile docker [options]` Options: * `--reads` * `--genome` * `--output` Example: `$ nextflow run jdetras/snp-calling -profile docker --reads '/path/to/reads/*_{1,2}.fq.gz' --genome '/path/to/reference/genome.fa' --output '/path/to/output'` #### `--reads` * The path to the FASTQ read files. * Wildcards (*, ?) can be used to declare multiple reads. Use single quotes when wildcards are used. * Default parameter: `$projectDir/data/reads/*_{1,2}.fq.gz` Example: `$ nextflow run jdetras/snp-calling -profile docker --reads '/path/to/reads/*_{1,2}.fq.gz'` #### `--genome` * The path to the genome file in fasta format. * The extension is `.fa`. 
* Default parameter: `$projectDir/data/reference/genome.fa` Example: `$ nextflow run jdetras/snp-calling -profile docker --genome /path/to/reference/genome.fa` #### `--output` * The path to the directory for the output files. * Default parameter: `$projectDir/output` ## Software * [BWA 0.7.17](http://bio-bwa.sourceforge.net/) * [Samtools 1.3.1](http://www.htslib.org/) * [GATK 4.2.6.1](https://gatk.broadinstitute.org/) """ ; ns1:keywords "variant calling, GATK4, BWA-mem, rice" ; ns1:license ; ns1:name "SNP-Calling Workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-03-21T05:48:16Z"^^ns1:Date ; ns1:dateModified "2024-10-24T15:31:14Z"^^ns1:Date ; ns1:description """# GRAVI: Gene Regulatory Analysis using Variable Inputs This is a `snakemake` workflow for: 1. Performing sample QC 2. Calling ChIP peaks 3. Performing Differential Binding Analysis 4. Comparing results across ChIP targets The minimum required input is one ChIP target with two conditions. Full documentation can be found [here](https://steveped.github.io/GRAVI/) ## Snakemake Implementation The basic workflow is written `snakemake`, requiring at least v7.7, and can be called using the following steps. 
Firstly, setup the required conda environments ``` snakemake \\ --use-conda \\ --conda-prefix '/home/steveped/mambaforge/envs/' \\ --conda-create-envs-only \\ --cores 1 ``` Secondly, create and inspect the rulegraph ``` snakemake --rulegraph > workflow/rules/rulegraph.dot dot -Tpdf workflow/rules/rulegraph.dot > workflow/rules/rulegraph.pdf ``` Finally, the workflow itself can be run using: ``` snakemake \\ -p \\ --use-conda \\ --conda-prefix '/home/steveped/mambaforge/envs/' \\ --notemp \\ --rerun-triggers mtime \\ --keep-going \\ --cores 16 ``` Note that this creates common environments able to be called by other workflows and is dependent on the user. For me, my global conda environments are stored in `/home/steveped/mambaforge/envs/`. For other users, this path will need to be modified. If wishing to tidy the directory after a successful run, you can check which non-essential files can be deleted using `snakemake -n --delete-temp-output --cores 1`. If the files earmarked for deletion are considered to be non-essential, they can be deleted by removing the `-n` flag from the above code: `snakemake --delete-temp-output --cores 1`. As the bedgraph files produced by `macs2 callpeak` are typically very large, hence their conversion to bigwig files during the workflow, this step can free a considerable amount of disk space. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.443.1" ; ns1:keywords "ChIP-seq, BAM, Bioinformatics" ; ns1:license ; ns1:name "GRAVI: Gene Regulatory Analysis using Variable Inputs" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Magnus Palmblad" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-09-01T09:01:32Z"^^ns1:Date ; ns1:dateModified "2025-06-04T13:58:44Z"^^ns1:Date ; ns1:description """## Introduction **wombat-p pipelines** is a bioinformatics analysis pipeline that bundles different workflows for the analysis of label-free proteomics data with the purpose of comparison and benchmarking. It allows using files from the [proteomics metadata standard SDRF](https://github.com/bigbio/proteomics-metadata-standard). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. We used one of the [nf-core](https://nf-co.re/) templates. ## Pipeline summary This work contains four major different workflows for the analysis of label-free proteomics data, originating from LC-MS experiments. 1. [MaxQuant](https://www.maxquant.org/) + [NormalyzerDE](https://normalyzerde.immunoprot.lth.se/) 2. [SearchGui](http://compomics.github.io/projects/searchgui) + [Proline](https://www.profiproteomics.fr/proline/) + [PolySTest](https://bitbucket.org/veitveit/polystest) 3. [Compomics tools](http://compomics.github.io/) + [FlashLFQ](https://github.com/smith-chem-wisc/FlashLFQ) + [MSqRob](https://github.com/statOmics/MSqRob) 4. 
Tools from the [Trans-Proteomic Pipeline](http://tools.proteomecenter.org/TPP.php) + [ROTS](https://bioconductor.org/packages/release/bioc/html/ROTS.html) Initialization and parameterization of the workflows is based on tools from the [SDRF pipelines](https://github.com/bigbio/sdrf-pipelines), the [ThermoRawFileParser](http://compomics.github.io/projects/ThermoRawFileParser) with our own contributions and additional programs from the wombat-p organization [https://github.com/wombat-p/Utilities] as well as our [fork](https://github.com/elixir-proteomics-community/sdrf-pipelines). This includes setting a generalized set of data analysis parameters and the calculation of multiple benchmarks. ## Credits nf-core/wombat was originally written by the members of the ELIXIR Implementation study [Comparison, benchmarking and dissemination of proteomics data analysis pipelines](https://elixir-europe.org/internal-projects/commissioned-services/proteomics-pipelines) under the lead of Veit Schwämmle and major participation of David Bouyssié and Fredrik Levander. ## Citations Manuscript in preparation As the workflows are using an nf-core template, we refer to the publication: > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.444.2" ; ns1:isBasedOn ; ns1:keywords "Proteomics, MaxQuant, Proline, Compomics, Trans-Proteomic" ; ns1:license ; ns1:name "WOMBAT-Pipelines" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-03-21T13:07:16Z"^^ns1:Date ; ns1:dateModified "2023-05-12T15:33:47Z"^^ns1:Date ; ns1:description """[![ci](https://github.com/zavolanlab/zarp/workflows/CI/badge.svg?branch=dev)](https://github.com/zavolanlab/zarp/actions?query=workflow%3Aci) [![GitHub license](https://img.shields.io/github/license/zavolanlab/zarp?color=orange)](https://github.com/zavolanlab/zarp/blob/dev/LICENSE) [![DOI:10.1101/2021.11.18.469017](http://img.shields.io/badge/DOI-10.1101/2021.11.18.469017-B31B1B.svg)](https://doi.org/10.1101/2021.11.18.469017)
# **ZARP** ([Zavolan-Lab](https://www.biozentrum.unibas.ch/research/researchgroups/overview/unit/zavolan/research-group-mihaela-zavolan/) Automated RNA-Seq Pipeline) ...is a generic RNA-Seq analysis workflow that allows users to process and analyze Illumina short-read sequencing libraries with minimum effort. The workflow relies on publicly available bioinformatics tools and currently handles single or paired-end stranded bulk RNA-seq data. The workflow is developed in [Snakemake](https://snakemake.readthedocs.io/en/stable/), a widely used workflow management system in the bioinformatics community. According to the current ZARP implementation, reads are analyzed (pre-processed, aligned, quantified) with state-of-the-art tools to give meaningful initial insights into the quality and composition of an RNA-Seq library, reducing hands-on time for bioinformaticians and giving experimentalists the possibility to rapidly assess their data. Additional reports summarise the results of the individual steps and provide useful visualisations. > **Note:** For a more detailed description of each step, please refer to the [workflow > documentation](https://github.com/zavolanlab/zarp/blob/main/pipeline_documentation.md). ## Requirements The workflow has been tested on: - CentOS 7.5 - Debian 10 - Ubuntu 16.04, 18.04 > **NOTE:** > Currently, we only support **Linux** execution. # Installation ## 1. Clone the repository Go to the desired directory/folder on your file system, then clone/get the repository and move into the respective directory with: ```bash git clone https://github.com/zavolanlab/zarp.git cd zarp ``` ## 2. Conda and Mamba installation Workflow dependencies can be conveniently installed with the [Conda](http://docs.conda.io/projects/conda/en/latest/index.html) package manager. We recommend that you install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for your system (Linux). Be sure to select Python 3 option. 
The workflow was built and tested with `miniconda 4.7.12`. Other versions are not guaranteed to work as expected. Given that Miniconda has been installed and is available in the current shell the first dependency for ZARP is the [Mamba](https://github.com/mamba-org/mamba) package manager, which needs to be installed in the `base` conda environment with: ```bash conda install mamba -n base -c conda-forge ``` ## 3. Dependencies installation For improved reproducibility and reusability of the workflow, each individual step of the workflow runs either in its own [Singularity](https://sylabs.io/singularity/) container or in its own [Conda](http://docs.conda.io/projects/conda/en/latest/index.html) virtual environment. As a consequence, running this workflow has very few individual dependencies. The **container execution** requires Singularity to be installed on the system where the workflow is executed. As the functional installation of Singularity requires root privileges, and Conda currently only provides Singularity for Linux architectures, the installation instructions are slightly different depending on your system/setup: ### For most users If you do *not* have root privileges on the machine you want to run the workflow on *or* if you do not have a Linux machine, please [install Singularity](https://sylabs.io/guides/3.5/admin-guide/installation.html) separately and in privileged mode, depending on your system. You may have to ask an authorized person (e.g., a systems administrator) to do that. This will almost certainly be required if you want to run the workflow on a high-performance computing (HPC) cluster. 
> **NOTE:** > The workflow has been tested with the following Singularity versions: > * `v2.6.2` > * `v3.5.2` After installing Singularity, install the remaining dependencies with: ```bash mamba env create -f install/environment.yml ``` ### As root user on Linux If you have a Linux machine, as well as root privileges, (e.g., if you plan to run the workflow on your own computer), you can execute the following command to include Singularity in the Conda environment: ```bash mamba env update -f install/environment.root.yml ``` ## 4. Activate environment Activate the Conda environment with: ```bash conda activate zarp ``` # Extra installation steps (optional) ## 5. Non-essential dependencies installation Most tests have additional dependencies. If you are planning to run tests, you will need to install these by executing the following command _in your active Conda environment_: ```bash mamba env update -f install/environment.dev.yml ``` ## 6. Successful installation tests We have prepared several tests to check the integrity of the workflow and its components. These can be found in subdirectories of the `tests/` directory. The most critical of these tests enable you to execute the entire workflow on a set of small example input files. Note that for this and other tests to complete successfully, [additional dependencies](#installing-non-essential-dependencies) need to be installed. 
Execute one of the following commands to run the test workflow on your local machine: * Test workflow on local machine with **Singularity**: ```bash bash tests/test_integration_workflow/test.local.sh ``` * Test workflow on local machine with **Conda**: ```bash bash tests/test_integration_workflow_with_conda/test.local.sh ``` Execute one of the following commands to run the test workflow on a [Slurm](https://slurm.schedmd.com/documentation.html)-managed high-performance computing (HPC) cluster: * Test workflow with **Singularity**: ```bash bash tests/test_integration_workflow/test.slurm.sh ``` * Test workflow with **Conda**: ```bash bash tests/test_integration_workflow_with_conda/test.slurm.sh ``` > **NOTE:** Depending on the configuration of your Slurm installation you may > need to adapt file `slurm-config.json` (located directly under `profiles` > directory) and the arguments to options `--cores` and `--jobs` > in the file `config.yaml` of a respective profile. > Consult the manual of your workload manager as well as the section of the > Snakemake manual dealing with [profiles]. # Running the workflow on your own samples 1. Assuming that your current directory is the repository's root directory, create a directory for your workflow run and move into it with: ```bash mkdir config/my_run cd config/my_run ``` 2. Create an empty sample table and a workflow configuration file: ```bash touch samples.tsv touch config.yaml ``` 3. Use your editor of choice to populate these files with appropriate values. Have a look at the examples in the `tests/` directory to see what the files should look like, specifically: - [samples.tsv](https://github.com/zavolanlab/zarp/blob/main/tests/input_files/samples.tsv) - [config.yaml](https://github.com/zavolanlab/zarp/blob/main/tests/input_files/config.yaml) - For more details and explanations, refer to the [pipeline-documentation](https://github.com/zavolanlab/zarp/blob/main/pipeline_documentation.md) 4. Create a runner script. 
Pick one of the following choices for either local or cluster execution. Before execution of the respective command, you need to remember to update the argument of the `--singularity-args` option of a respective profile (file: `profiles/{profile}/config.yaml`) so that it contains a comma-separated list of _all_ directories containing input data files (samples and any annotation files etc) required for your run. Runner script for _local execution_: ```bash cat << "EOF" > run.sh #!/bin/bash snakemake \\ --profile="../../profiles/local-singularity" \\ --configfile="config.yaml" EOF ``` **OR** Runner script for _Slurm cluster execution_ (note that you may need to modify the arguments to `--jobs` and `--cores` in the file: `profiles/slurm-singularity/config.yaml` depending on your HPC and workload manager configuration): ```bash cat << "EOF" > run.sh #!/bin/bash mkdir -p logs/cluster_log snakemake \\ --profile="../profiles/slurm-singularity" \\ --configfile="config.yaml" EOF ``` When running the pipeline with *conda* you should use `local-conda` and `slurm-conda` profiles instead. 5. Start your workflow run: ```bash bash run.sh ``` # Sample downloads from SRA An independent Snakemake workflow `workflow/rules/sra_download.smk` is included for the download of SRA samples with [sra-tools]. > Note: as of Snakemake 7.3.1, only profile conda is supported. > Singularity fails because the *sra-tools* Docker container only has `sh` but `bash` is required. > Note: The workflow uses the implicit temporary directory from snakemake, which is called with [resources.tmpdir]. The workflow expects the following config: * `samples`, a sample table (tsv) with column *sample* containing *SRR* identifiers, see example [here](https://github.com/zavolanlab/zarp/blob/main/tests/input_files/sra_samples.tsv). * `outdir`, an output directory * `samples_out`, a pointer to a modified sample table with location of fastq files * `cluster_log_dir`, the cluster log directory. 
For executing the example one can use the following (with activated *zarp* environment): ```bash snakemake --snakefile="workflow/rules/sra_download.smk" \\ --profile="profiles/local-conda" \\ --config samples="tests/input_files/sra_samples.tsv" \\ outdir="results/sra_downloads" \\ samples_out="results/sra_downloads/sra_samples.out.tsv" \\ log_dir="logs" \\ cluster_log_dir="logs/cluster_log" ``` After successful execution, `results/sra_downloads/sra_samples.out.tsv` should contain: ```tsv sample fq1 fq2 SRR18552868 results/sra_downloads/SRR18552868/SRR18552868.fastq.gz SRR18549672 results/sra_downloads/SRR18549672/SRR18549672_1.fastq.gz results/sra_downloads/SRR18549672/SRR18549672_2.fastq.gz ``` # Metadata completion with HTSinfer An independent Snakemake workflow `workflow/rules/htsinfer.smk` that populates the `samples.tsv` required by ZARP with the sample specific parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size`. Those parameters are inferred from the provided `fastq.gz` files by [HTSinfer](https://github.com/zavolanlab/htsinfer). > Note: The workflow uses the implicit temporary directory from snakemake, which is called with [resources.tmpdir]. The workflow expects the following config: * `samples`, a sample table (tsv) with column *sample* containing sample identifiers, as well as columns *fq1* and *fq2* containing the paths to the input fastq files see example [here](https://github.com/zavolanlab/zarp/blob/main/tests/input_files/sra_samples.tsv). If the table contains further ZARP compatible columns (see [pipeline documentation](https://github.com/zavolanlab/zarp/blob/main/pipeline_documentation.md#read-sample-table)), the values specified there by the user are given priority over htsinfer's results. 
* `outdir`, an output directory * `samples_out`, path to a modified sample table with inferred parameters * `records`, set to 100000 per default For executing the example one can use the following (with activated *zarp* environment): ```bash cd tests/test_htsinfer_workflow snakemake \\ --snakefile="../../workflow/rules/htsinfer.smk" \\ --restart-times=0 \\ --profile="../../profiles/local-singularity" \\ --config outdir="results" \\ samples="../input_files/htsinfer_samples.tsv" \\ samples_out="samples_htsinfer.tsv" \\ --notemp \\ --keep-incomplete ``` However, this call will exit with an error, as not all parameters can be inferred from the example files. The argument `--keep-incomplete` makes sure the `samples_htsinfer.tsv` file can nevertheless be inspected. After successful execution - if all parameters could be either inferred or were specified by the user - `[OUTDIR]/[SAMPLES_OUT]` should contain a populated table with parameters `seqmode`, `f1_3p`, `f2_3p`, `organism`, `libtype` and `index_size` for all input samples as described in the [pipeline documentation](https://github.com/zavolanlab/zarp/blob/main/pipeline_documentation.md#read-sample-table). """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.447.1" ; ns1:image ; ns1:keywords "Bioinformatics, rna, rna-seq, RNASEQ, NGS, high-throughput" ; ns1:license ; ns1:name "ZARP: An automated workflow for processing of RNA-seq data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2025-09-03T02:04:50Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:02:10Z"^^ns1:Date ; ns1:description """

nf-core/rnaseq

[![GitHub Actions CI Status](https://github.com/nf-core/rnaseq/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/rnaseq/actions/workflows/nf-test.yml) [![GitHub Actions Linting Status](https://github.com/nf-core/rnaseq/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/rnaseq/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/rnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.1400710-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.1400710) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) [![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/rnaseq) [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23rnaseq-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/rnaseq)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction **nf-core/rnaseq** is a bioinformatics pipeline that can be used to analyse RNA sequencing data obtained from organisms with a reference genome and annotation. It takes a samplesheet and FASTQ files as input, performs quality control (QC), trimming and (pseudo-)alignment, and produces a gene expression matrix and extensive QC report. ![nf-core/rnaseq metro map](docs/images/nf-core-rnaseq_metro_map_grey_animated.svg) > In case the image above is not loading, please have a look at the [static version](docs/images/nf-core-rnaseq_metro_map_grey.png). 1. Merge re-sequenced FastQ files ([`cat`](http://www.linfo.org/cat.html)) 2. Auto-infer strandedness by subsampling and pseudoalignment ([`fq`](https://github.com/stjude-rust-labs/fq), [`Salmon`](https://combine-lab.github.io/salmon/)) 3. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 4. UMI extraction ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools)) 5. Adapter and quality trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) 6. Removal of genome contaminants ([`BBSplit`](http://seqanswers.com/forums/showthread.php?t=41288)) 7. Removal of ribosomal RNA ([`SortMeRNA`](https://github.com/biocore/sortmerna)) 8. Choice of multiple alignment and quantification routes (_For `STAR` the sentieon implementation can be chosen_): 1. [`STAR`](https://github.com/alexdobin/STAR) -> [`Salmon`](https://combine-lab.github.io/salmon/) 2. [`STAR`](https://github.com/alexdobin/STAR) -> [`RSEM`](https://github.com/deweylab/RSEM) 3. 
[`HiSAT2`](https://ccb.jhu.edu/software/hisat2/index.shtml) -> **NO QUANTIFICATION** 9. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) 10. UMI-based deduplication ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools)) 11. Duplicate read marking ([`picard MarkDuplicates`](https://broadinstitute.github.io/picard/)) 12. Transcript assembly and quantification ([`StringTie`](https://ccb.jhu.edu/software/stringtie/)) 13. Create bigWig coverage files ([`BEDTools`](https://github.com/arq5x/bedtools2/), [`bedGraphToBigWig`](http://hgdownload.soe.ucsc.edu/admin/exe/)) 14. Extensive quality control: 1. [`RSeQC`](http://rseqc.sourceforge.net/) 2. [`Qualimap`](http://qualimap.bioinfo.cipf.es/) 3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html) 4. [`Preseq`](http://smithlabresearch.org/software/preseq/) 5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) 6. [`Kraken2`](https://ccb.jhu.edu/software/kraken2/) -> [`Bracken`](https://ccb.jhu.edu/software/bracken/) on unaligned sequences; _optional_ 15. Pseudoalignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/) or ['Kallisto'](https://pachterlab.github.io/kallisto/); _optional_) 16. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) > **Note** > The SRA download functionality has been removed from the pipeline (`>=3.2`) and ported to an independent workflow called [nf-core/fetchngs](https://nf-co.re/fetchngs). You can provide `--nf_core_pipeline rnaseq` when running nf-core/fetchngs to download and auto-create a samplesheet containing publicly available samples that can be accepted directly as input by this pipeline. 
> **Warning** > Quantification isn't performed if using `--aligner hisat2` due to the lack of an appropriate option to calculate accurate expression estimates from HISAT2 derived genomic alignments. However, you can use this route if you have a preference for the alignment, QC and other types of downstream analysis compatible with the output of HISAT2. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. First, prepare a samplesheet with your input data that looks as follows: **samplesheet.csv**: ```csv sample,fastq_1,fastq_2,strandedness CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,auto CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,auto CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz,auto ``` Each row represents a fastq file (single-end) or a pair of fastq files (paired end). Rows with the same sample identifier are considered technical replicates and merged automatically. The strandedness refers to the library preparation and will be automatically inferred if set to `auto`. > [!WARNING] > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). 
Now, you can run the pipeline using: ```bash nextflow run nf-core/rnaseq \\ --input \\ --outdir \\ --gtf \\ --fasta \\ -profile ``` For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/rnaseq/usage) and the [parameter documentation](https://nf-co.re/rnaseq/parameters). ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/rnaseq/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/rnaseq/output). This pipeline quantifies RNA-sequenced reads relative to genes/transcripts in the genome and normalizes the resulting data. It does not compare the samples statistically in order to assign significance in the form of FDR or P-values. For downstream analyses, the output files from this pipeline can be analysed directly in statistical environments like [R](https://www.r-project.org/), [Julia](https://julialang.org/) or via the [nf-core/differentialabundance](https://github.com/nf-core/differentialabundance/) pipeline. ## Online videos A short talk about the history, current status and functionality on offer in this pipeline was given by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) on [8th February 2022](https://nf-co.re/events/2022/bytesize-32-nf-core-rnaseq) as part of the nf-core/bytesize series. You can find numerous talks on the [nf-core events page](https://nf-co.re/events) from various topics including writing pipelines/modules in Nextflow DSL2, using nf-core tooling, running nf-core pipelines as well as more generic content like contributing to Github. Please check them out! 
## Credits These scripts were originally written for use at the [National Genomics Infrastructure](https://ngisweden.scilifelab.se), part of [SciLifeLab](http://www.scilifelab.se/) in Stockholm, Sweden, by Phil Ewels ([@ewels](https://github.com/ewels)) and Rickard Hammarén ([@Hammarn](https://github.com/Hammarn)). The pipeline was re-written in Nextflow DSL2 and is primarily maintained by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/). The pipeline workflow diagram was initially designed by Sarah Guinchard ([@G-Sarah](https://github.com/G-Sarah)) and James Fellows Yates ([@jfy133](https://github.com/jfy133)), further modifications were made by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) and Maxime Garcia ([@maxulysse](https://github.com/maxulysse)). Many thanks to others who have helped out along the way too, including (but not limited to): - [Alex Peltzer](https://github.com/apeltzer) - [Colin Davenport](https://github.com/colindaven) - [Denis Moreno](https://github.com/Galithil) - [Edmund Miller](https://github.com/edmundmiller) - [Gregor Sturm](https://github.com/grst) - [Jacki Buros Novik](https://github.com/jburos) - [Lorena Pantano](https://github.com/lpantano) - [Matthias Zepper](https://github.com/MatthiasZepper) - [Maxime Garcia](https://github.com/maxulysse) - [Olga Botvinnik](https://github.com/olgabot) - [@orzechoj](https://github.com/orzechoj) - [Paolo Di Tommaso](https://github.com/pditommaso) - [Rob Syme](https://github.com/robsyme) ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#rnaseq` channel](https://nfcore.slack.com/channels/rnaseq) (you can join with [this invite](https://nf-co.re/join/slack)). 
## Citations If you use nf-core/rnaseq for your analysis, please cite it using the following doi: [10.5281/zenodo.1400710](https://doi.org/10.5281/zenodo.1400710) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:image ; ns1:isBasedOn ; ns1:keywords "rna, rna-seq" ; ns1:license ; ns1:name "nf-core/rnaseq" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 37 . a ns1:Person ; ns1:name "" . a ns1:Person ; ns1:name "Jesse van Dam" . a ns1:Person ; ns1:name "Peter Schaap" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward primer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse primer" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sample name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_fastqc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "files_to_folder_ngtax" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-03-30T20:49:19Z"^^ns1:Date ; ns1:dateModified "2023-03-30T20:58:56Z"^^ns1:Date ; ns1:description """The containerised pipeline for profiling shotgun metagenomic data is derived from the [MGnify](https://www.ebi.ac.uk/metagenomics/) pipeline raw-reads analyses, a well-established resource used for analyzing microbiome data. Key components: - Quality control and decontamination - rRNA and ncRNA detection using Rfam database - Taxonomic classification of SSU and LSU regions - Abundance analysis with mOTUs""" ; ns1:image ; ns1:keywords "Nextflow, Metagenomics" ; ns1:license ; ns1:name "MGnify raw reads taxonomic profiling pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-04-06T07:54:59Z"^^ns1:Date ; ns1:dateModified "2023-04-06T07:54:59Z"^^ns1:Date ; ns1:description """# RASflow: RNA-Seq Analysis Snakemake Workflow RASflow is a modular, flexible and user-friendly RNA-Seq analysis workflow. RASflow can be applied to both model and non-model organisms. It supports mapping RNA-Seq raw reads to both genome and transcriptome (can be downloaded from public database or can be homemade by users) and it can do both transcript- and gene-level Differential Expression Analysis (DEA) when transcriptome is used as mapping reference. It requires little programming skill for basic use. If you're good at programming, you can do more magic with RASflow! You can help support RASflow by citing our publication: **Zhang, X., Jonassen, I. RASflow: an RNA-Seq analysis workflow with Snakemake. BMC Bioinformatics 21, 110 (2020). 
https://doi.org/10.1186/s12859-020-3433-x** """ ; ns1:image ; ns1:keywords "Transcriptomics" ; ns1:license ; ns1:name "RASflow: RNA-Seq Analysis Snakemake Workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-06-17T14:20:31Z"^^ns1:Date ; ns1:dateModified "2025-06-17T14:24:04Z"^^ns1:Date ; ns1:description """# Mobilome Annotation Pipeline (former MoMofy) Bacteria can acquire genetic material through horizontal gene transfer, allowing them to rapidly adapt to changing environmental conditions. These mobile genetic elements can be classified into three main categories: plasmids, phages, and integrative elements. Plasmids are mostly extrachromosomal; phages can be found extrachromosomal or as temperate phages (prophages); whereas integrons are stably inserted in the chromosome. Autonomous elements are those integrative elements capable of excising themselves from the chromosome and reintegrating elsewhere. They can use a transposase (like insertion sequences and transposons) or an integrase/excisionase (like ICEs and IMEs). The Mobilome Annotation Pipeline is a wrapper that integrates the output of different tools designed for the prediction of plasmids, phages, insertion sequences, and other autonomous integrative mobile genetic elements such as ICEs, IMEs and integrons in prokaryotic genomes and metagenomes. The output is a PROKKA gff file with extra entries for the mobilome. 
## Contents - [ Workflow ](#wf) - [ Setup ](#sp) - [ Install and dependencies ](#install) - [ Usage ](#usage) - [ Inputs ](#in) - [ Outputs ](#out) - [ Tests ](#test) - [ Citation ](#cite) ## Workflow This workflow has the following main subworkflows: - Preprocessing: Rename and filter contigs, and run PROKKA annotation - Prediction: Run geNomad, ICEfinder, IntegronFinder, and ISEScan - Annotation: Generate extra-annotation for antimicrobial resistance genes (AMRFinderPlus) and other mobilome-related proteins (MobileOG). - Integration: Parse and integrate the outputs generated on `Prediction` and `Annotation` subworkflows. In this step optional results of VIRify v3.0.0 can be incorporated. MGEs <500 bp in length and predictions with no genes are discarded. - Postprocessing: Write the mobilome fasta file, write a report of the location of AMR genes (either mobilome or chromosome), and generate three new GFF files: 1. `mobilome_clean.gff`: mobilome + associated CDSs 2. `mobilome_extra.gff`: mobilome + ViPhOGs/mobileOG annotated genes (note that ViPhOG annotation is generated by VIRify) 3. `mobilome_nogenes.gff`: mobilome only The output `mobilome_nogenes.gff` is validated in this subworkflow. ## Setup This workflow is built using [Nextflow](https://www.nextflow.io/). It uses Singularity containers making installation trivial and results highly reproducible. Explained in this section, there is one manual step required to build the singularity image for [ICEfinder](https://bioinfo-mml.sjtu.edu.cn/ICEfinder/index.php), as we can't distribute that software due to license issues. 
- Install [Nextflow version >=21.10](https://www.nextflow.io/docs/latest/getstarted.html#installation) - Install [Singularity](https://github.com/apptainer/singularity/blob/master/INSTALL.md) ## Install and dependencies To get a copy of the Mobilome Annotation Pipeline, clone this repo by: ```bash $ git clone https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline.git ``` The mobileOG-database is required to run an extra step of annotation on the mobilome coding sequences. The first time you run the Mobilome Annotation Pipeline, you will need to download the [Beatrix 1.6 v1](https://mobileogdb.flsi.cloud.vt.edu/entries/database_download) database, move the tarball to `mobilome-annotation-pipeline/databases`, decompress it, and run the script to format the db for diamond: ```bash $ mv beatrix-1-6_v1_all.zip /PATH/mobilome-annotation-pipeline/databases $ cd /PATH/mobilome-annotation-pipeline/databases $ unzip beatrix-1-6_v1_all.zip $ nextflow run /PATH/mobilome-annotation-pipeline/format_mobileOG.nf ``` Two additional databases need to be manually downloaded and extracted: [AMRFinder plus db](https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest) and the [geNomad database](https://zenodo.org/records/8339387) databases. Then you can provide the paths to your databases using the `mobileog_db`, the `amrfinder_plus_db` and the `genomad_db` respectively when you run the pipeline. Most of the tools are available on [quay.io](https://quay.io) and no install is needed. However, in the case of ICEfinder, you will need to contact the author to get a copy of the software, visit the [ICEfinder website](https://bioinfo-mml.sjtu.edu.cn/ICEfinder/download.html) for more information. 
Once you have the `ICEfinder_linux.tar.gz` tarball, move it to `mobilome-annotation-pipeline/templates` and build the singularity image using the following command: ```bash $ mv ICEfinder_linux.tar.gz /PATH/mobilome-annotation-pipeline/templates/ $ cd /PATH/mobilome-annotation-pipeline/templates/ $ sudo singularity build ../../singularity/icefinder-v1.0-local.sif icefinder-v1.0-local.def ``` The path to the ICEfinder image needs to be provided when running the pipeline, unless a custom config file is created. ## Inputs To run the Mobilome Annotation Pipeline on multiple samples, prepare a samplesheet with your input data that looks as in the following example. Note that `virify_gff` is an optional input for this pipeline generated with [VIRify](https://github.com/EBI-Metagenomics/emg-viral-pipeline) v3.0.0 tool. `samplesheet.csv`: ```csv sample,assembly,user_proteins_gff,virify_gff minimal,/PATH/assembly.fasta,, assembly_proteins,/PATH/assembly.fasta,/PATH/proteins.gff, assembly_proteins_virify,/PATH/assembly.fasta,/PATH/proteins.gff,/PATH/virify_out.gff ``` Each row represents a sample. The minimal input is the (meta)genome assembly in fasta format. Basic run: ```bash $ nextflow run /PATH/mobilome-annotation-pipeline/main.nf --input samplesheet.csv [--icefinder_sif icefinder-v1.0-local.sif] ``` Note that the final output in gff format is created by adding information to PROKKA output. If you have your own protein prediction files, provide the path to the uncompressed gff file in the samplesheet.csv. This file will be used to generate a `user_mobilome_extra.gff` file containing the mobilome plus any extra annotation generated on the annotation subworkflow. If you want to integrate VIRify results to the final output provide the path to the GFF file generated by VIRify v3.0.0 in your samplesheet.csv. ## Outputs Results will be written by default in the `mobilome_results` directory unless the `--outdir` option is used. 
There, you will find the following outputs: ```bash mobilome_results/ ├── mobilome.fasta ├── mobilome_prokka.gff ├── overlapping_integrons.txt ├── discarded_mge.txt ├── func_annot/ ├── gff_output_files/ ├── prediction/ └── preprocessing ``` The AMRFinderPlus results are generated by default. The `func_annot/amr_location.txt` file contains a summary of the AMR genes annotated and their location (either mobilome or chromosome). The file `discarded_mge.txt` contains a list of predictions that were discarded, along with the reason for their exclusion. Possible reasons include: 1. 'mge < 500bp' Discarded by length. 2. 'no_cds' If there are no genes encoded in the prediction. The file `overlapping_integrons.txt` is a report of long-MGEs with overlapping coordinates. No predictions are discarded in this case. The main output files containing the mobilome predictions are `mobilome.fasta` containing the nucleotide sequences of every prediction, and `mobilome_prokka.gff` containing the mobilome annotation plus any other feature annotated by PROKKA, mobileOG, or ViPhOG (only when VIRify results are provided). The mobilome prediction IDs are built as follows: 1. Contig ID 2. MGE type: flanking_site recombination_site prophage viral_sequence plasmid phage_plasmid integron conjugative_integron insertion_sequence 3. Start and end coordinates separated by ':' Example: ```bash >contig_id|mge_type-start:end ``` Any CDS with a coverage >= 0.9 in the boundaries of a predicted MGE is considered as part of the mobilome and labelled accordingly in the attributes field under the key `location`. 
The labels used in the Type column of the gff file corresponds to the following nomenclature according to the [Sequence Ontology resource](http://www.sequenceontology.org/browser/current_svn/term/SO:0000001) when possible: | Type in gff file | Sequence ontology ID | Element description | Reporting tool | | -------------------------------- | --------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------- | | insertion_sequence | [SO:0000973](http://www.sequenceontology.org/browser/current_svn/term/SO:0000973) | Insertion sequence | ISEScan, PaliDIS | | terminal_inverted_repeat_element | [SO:0000481](http://www.sequenceontology.org/browser/current_svn/term/SO:0000481) | Terminal Inverted Repeat (TIR) flanking insertion sequences | ISEScan, PaliDIS | | integron | [SO:0000365](http://www.sequenceontology.org/browser/current_svn/term/SO:0000365) | Integrative mobilizable element | IntegronFinder, ICEfinder | | attC_site | [SO:0000950](http://www.sequenceontology.org/browser/current_svn/term/SO:0000950) | Integration site of DNA integron | IntegronFinder | | conjugative_integron | [SO:0000371](http://www.sequenceontology.org/browser/current_svn/term/SO:0000371) | Integrative Conjugative Element | ICEfinder | | direct_repeat | [SO:0000314](http://www.sequenceontology.org/browser/current_svn/term/SO:0000314) | Flanking regions on mobilizable elements | ICEfinder | | prophage | [SO:0001006](http://www.sequenceontology.org/browser/current_svn/term/SO:0001006) | Temperate phage | geNomad, VIRify | | viral_sequence | [SO:0001041](http://www.sequenceontology.org/browser/current_svn/term/SO:0001041) | Viral genome fragment | geNomad, VIRify | | plasmid | [SO:0000155](http://www.sequenceontology.org/browser/current_svn/term/SO:0000155) | Plasmid | geNomad | ## Tests Nextflow tests are executed with [nf-test](https://github.com/askimed/nf-test). 
It takes around 3 minutes to execute. Run: ```bash $ cd mobilome-annotation-pipeline/ $ nf-test test ``` ## Citation The Mobilome Annotation Pipeline parses and integrates the output of the following tools and DBs sorted alphabetically: - AMRFinderPlus v3.11.4 with database v2023-02-23.1 [Feldgarden et al., Sci Rep, 2021](https://doi.org/10.1038/s41598-021-91456-0) - Diamond v2.0.12 [Buchfink et al., Nature Methods, 2021](https://doi.org/10.1038/s41592-021-01101-x) - geNomad v1.6.1 [Camargo et al., Nature Biotechnology, 2023](https://doi.org/10.1038/s41587-023-01953-y) - ICEfinder v1.0 [Liu et al., Nucleic Acids Res, 2019](https://doi.org/10.1093/nar/gky1123) - IntegronFinder2 v2.0.2 [Néron et al., Microorganisms, 2022](https://doi.org/10.3390/microorganisms10040700) - ISEScan v1.7.2.3 [Xie et al., Bioinformatics, 2017](https://doi.org/10.1093/bioinformatics/btx433) - MobileOG-DB Beatrix 1.6 v1 [Brown et al., Appl Environ Microbiol, 2022](https://doi.org/10.1128/aem.00991-22) - PROKKA v1.14.6 [Seemann, Bioinformatics, 2014](https://doi.org/10.1093/bioinformatics/btu153) - VIRify v3.0.0 [Rangel-Pineros et al., PLoS Comput Biol, 2023](https://doi.org/10.1371/journal.pcbi.1011422) """ ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Mobilome, Genomics, Metagenomics, Nextflow, MGE" ; ns1:license ; ns1:name "Mobilome Annotation Pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-07-05T13:47:29Z"^^ns1:Date ; ns1:dateModified "2023-07-11T10:35:35Z"^^ns1:Date ; ns1:description """# BatchConvert ![DOI:10.5281](https://zenodo.org/badge/doi/10.5281/zenodo.7955974.svg) A command line tool for converting image data into either of the standard file formats OME-TIFF or OME-Zarr. 
The tool wraps the dedicated file converters bfconvert and bioformats2raw to convert into OME-TIFF or OME-Zarr, respectively. The workflow management system NextFlow is used to perform conversion in parallel for batches of images. The tool also wraps s3 and Aspera clients (go-mc and aspera-cli, respectively). Therefore, input and output locations can be specified as local or remote storage and file transfer will be performed automatically. The conversion can be run on HPC with Slurm. ![](figures/diagram.png) ## Installation & Dependencies **Important** note: The package has been so far only tested on Ubuntu 20.04. The minimal dependency to run the tool is NextFlow, which should be installed and made accessible from the command line. If conda exists on your system, you can install BatchConvert together with NextFlow using the following script: ``` git clone https://github.com/Euro-BioImaging/BatchConvert.git && \\ source BatchConvert/installation/install_with_nextflow.sh ``` If you already have NextFlow installed and accessible from the command line (or if you prefer to install it manually e.g., as shown [here](https://www.nextflow.io/docs/latest/getstarted.html)), you can also install BatchConvert alone, using the following script: ``` git clone https://github.com/Euro-BioImaging/BatchConvert.git && \\ source BatchConvert/installation/install.sh ``` Other dependencies (which will be **automatically** installed): - bioformats2raw (entrypoint bioformats2raw) - bftools (entrypoint bfconvert) - go-mc (entrypoint mc) - aspera-cli (entrypoint ascp) These dependencies will be pulled and cached automatically at the first execution of the conversion command. The mode of dependency management can be specified by using the command line option ``--profile`` or `-pf`. Depending on how this option is specified, the dependencies will be acquired / run either via conda or via docker/singularity containers. 
Specifying ``--profile conda`` (default) will install the dependencies to an environment at ``./.condaCache`` and use this environment to run the workflow. This option requires that miniconda/anaconda is installed on your system. Alternatively, specifying ``--profile docker`` or ``--profile singularity`` will pull a docker or singularity image with the dependencies, respectively, and use this image to run the workflow. These options assume that the respective container runtime (docker or singularity) is available on your system. If singularity is being used, a cache directory will be created at the path ``./.singularityCache`` where the singularity image is stored. Finally, you can still choose to install the dependencies manually and use your own installations to run the workflow. In this case, you should specify ``--profile standard`` and make sure the entrypoints specified above are recognised by your shell. ## Configuration BatchConvert can be configured to have default options for file conversion and transfer. Probably, the most important sets of parameters to be configured include credentials for the remote ends. The easiest way to configure remote stores is by running the interactive configuration command as indicated below. ### Configuration of the s3 object store Run the interactive configuration command: `batchconvert configure_s3_remote` This will start a sequence of requests for s3 credentials such as name, url, access, etc. Provide each requested credential and click enter. Continue this cycle until the process is finished. 
Upon completing the configuration, the sequence of commands should roughly look like this: ``` oezdemir@pc-ellenberg108:~$ batchconvert configure_s3_remote enter remote name (for example s3) s3 enter url: https://s3.embl.de enter access key: "your-access-key" enter secret key: "your-secret-key" enter bucket name: "your-bucket" Configuration of the default s3 credentials is complete ``` ### Configuration of the BioStudies user space Run the interactive configuration command: `batchconvert configure_bia_remote` This will prompt a request for the secret directory to connect to. Enter the secret directory for your user space and click enter. Upon completing the configuration, the sequence of commands should roughly look like this: ``` oezdemir@pc-ellenberg108:~$ batchconvert configure_bia_remote enter the secret directory for BioImage Archive user space: "your-secret-directory" configuration of the default bia credentials is complete ``` ### Configuration of the slurm options BatchConvert can also run on slurm clusters. In order to configure the slurm parameters, run the interactive configuration command: `batchconvert configure_slurm` This will start a sequence of requests for slurm options. Provide each requested option and click enter. Continue this cycle until the process is finished. 
Upon completing the configuration, the sequence of commands should roughly look like this: ``` oezdemir@pc-ellenberg108:~$ batchconvert configure_slurm Please enter value for queue_size Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the current value ´50´ s Please enter value for submit_rate_limit Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the current value ´10/2min´ s Please enter value for cluster_options Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the current value ´--mem-per-cpu=3140 --cpus-per-task=16´ s Please enter value for time Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the current value ´6h´ s configuration of the default slurm parameters is complete ``` ### Configuration of the default conversion parameters While all conversion parameters can be specified as command line arguments, it can be useful for the users to set their own default parameters to avoid re-entering those parameters for subsequent executions. BatchConvert allows for interactive configuration of conversion in the same way as configuration of the remote stores described above. To configure the conversion into OME-TIFF, run the following command: `batchconvert configure_ometiff` This will prompt the user to enter a series of parameters, which will then be saved as the default parameters to be passed to the `batchconvert ometiff` command. 
Upon completing the configuration, the sequence of commands should look similar to: ``` oezdemir@pc-ellenberg108:~$ batchconvert configure_ometiff Please enter value for noflat Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bfconvert defaults" s Please enter value for series Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bfconvert defaults" s Please enter value for timepoint Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bfconvert defaults" s ... ... ... ... ... ... Configuration of the default parameters for 'bfconvert' is complete ``` To configure the conversion into OME-Zarr, run the following command: `batchconvert configure_omezarr` Similarly, this will prompt the user to enter a series of parameters, which will then be saved as the default parameters to be passed to the `batchconvert omezarr` command. Upon completing the configuration, the sequence of commands should look similar to: ``` oezdemir@pc-ellenberg108:~$ batchconvert configure_omezarr Please enter value for resolutions_zarr Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bioformats2raw defaults" s Please enter value for chunk_h Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bioformats2raw defaults" s Please enter value for chunk_w Click enter if this parameter is not applicable Enter "skip" or "s" if you would like to keep the parameter´s current value, which is "bioformats2raw defaults" ... ... ... ... ... ... 
Configuration of the default parameters for 'bioformats2raw' is complete ``` It is important to note that the initial defaults for the conversion parameters are the same as the defaults of the backend tools bfconvert and bioformats2raw, as noted in the prompt excerpt above. Through interactive configuration, the user is overriding these initial defaults and setting their own defaults. It is possible to reset the initial defaults by running the following command. `batchconvert reset_defaults` Another important point is that any of these configured parameters can be overridden by passing a value to that parameter in the commandline. For instance, in the following command, the value of 20 will be assigned to `chunk_h` parameter even if the value for the same parameter might be different in the configuration file. `batchconvert omezarr --chunk_h 20 "path/to/input" "path/to/output"` ## Examples ### Local conversion #### Parallel conversion of files to separate OME-TIFFs / OME-Zarrs: Convert a batch of images on your local storage into OME-TIFF format. Note that the `input_path` in the command given below is typically a directory with multiple image files but a single image file can also be passed:\\ `batchconvert ometiff -pf conda "input_path" "output_path"` Note that if this is your first conversion with the profile `conda`, it will take a while for a conda environment with the dependencies to be created. All the subsequent conversion commands with the profile `conda`, however, will use this environment, and thus show no such delay. Since conda is the default profile, it does not have to be explicitly included in the command line. 
Thus, the command can be shortened to:\\ `batchconvert ometiff "input_path" "output_path"` Convert only the first channel of the images:\\ `batchconvert ometiff -chn 0 "input_path" "output_path"` Crop the images being converted along x and y axis by 150 pixels:\\ `batchconvert ometiff -cr 0,0,150,150 "input_path" "output_path"` Convert into OME-Zarr instead:\\ `batchconvert omezarr "input_path" "output_path"` Convert into OME-Zarr with 3 resolution levels:\\ `batchconvert omezarr -rz 3 "input_path" "output_path"` Select a subset of images with a matching string such as "mutation":\\ `batchconvert omezarr -p mutation "input_path" "output_path"` Select a subset of images using wildcards. Note that the use of "" around the input path is necessary when using wildcards:\\ `batchconvert omezarr "input_path/*D3*.oir" "output_path"` Convert by using a singularity container instead of conda environment (requires singularity to be installed on your system):\\ `batchconvert omezarr -pf singularity "input_path/*D3*.oir" "output_path"` Convert by using a docker container instead of conda environment (requires docker to be installed on your system):\\ `batchconvert omezarr -pf docker "input_path/*D3*.oir" "output_path"` Note that similarly to the case with the profile `conda`, the first execution of a conversion with the profile `singularity` or `docker` will take a while for the container image to be pulled. All the subsequent conversion commands using a container option will use this image, and thus show no such delay. Convert local data and upload the output to an s3 bucket. Note that the output path is created relative to the bucket specified in your s3 configuration:\\ `batchconvert omezarr -dt s3 "input_path" "output_path"` Receive input files from an s3 bucket, convert locally and upload the output to the same bucket. Note that wildcards cannot be used when the input is from s3. 
Use pattern matching option `-p` for selecting a subset of input files:\\ `batchconvert omezarr -p mutation -st s3 -dt s3 "input_path" "output_path"` Receive input files from your private BioStudies user space and convert them locally. Use pattern matching option `-p` for selecting a subset of input files:\\ `batchconvert omezarr -p mutation -st bia "input_path" "output_path"` Receive an input from an s3 bucket, convert locally and upload the output to your private BioStudies user space. Use pattern matching option `-p` for selecting a subset of input files:\\ `batchconvert omezarr -p mutation -st s3 -dt bia "input_path" "output_path"` Note that in all the examples shown above, BatchConvert treats each input file as separate, standalone data point, disregarding the possibility that some of the input files might belong to the same multidimensional array. Thus, each input file is converted to an independent OME-TIFF / OME-Zarr and the number of outputs will thus equal the number of selected input files. An alternative scenario is discussed below. #### Parallel conversion of file groups by stacking multiple files into single OME-TIFFs / OME-Zarrs: When the flag `--merge_files` is specified, BatchConvert tries to detect which input files might belong to the same multidimensional array based on the patterns in the filenames. Then a "grouped conversion" is performed, meaning that the files belonging to the same dataset will be incorporated into a single OME-TIFF / OME-Zarr series, in that files will be concatenated along specific dimension(s) during the conversion. Multiple file groups in the input directory can be detected and converted in parallel. This feature uses Bio-Formats's pattern files as described [here](https://docs.openmicroscopy.org/bio-formats/6.6.0/formats/pattern-file.html). However, BatchConvert generates pattern files automatically, allowing the user to directly use the input directory in the conversion command. 
BatchConvert also has the option of specifying the concatenation axes in the command line, which is especially useful in cases where the filenames may not contain dimension information. To be able to use the `--merge files` flag, the input file names must obey certain rules: 1. File names in the same group must be uniform, except for one or more **numeric field(s)**, which should show incremental change across the files. These so-called **variable fields** will be detected and used as the dimension(s) of concatenation. 2. The length of variable fields must be uniform within the group. For instance, if the variable field has values reaching multi-digit numbers, leading "0"s should be included where needed in the file names to make the variable field length uniform within the group. 3. Typically, each variable field should follow a dimension specifier. What patterns can be used as dimension specifiers are explained [here](https://docs.openmicroscopy.org/bio-formats/6.6.0/formats/pattern-file.html). However, BatchConvert also has the option `--concatenation_order`, which allows the user to specify from the command line, the dimension(s), along which the files must be concatenated. 4. File names that are unique and cannot be associated with any group will be assumed as standalone images and converted accordingly. Below are some examples of grouped conversion commands in the context of different possible use-case scenarios: **Example 1:** This is an example of a folder with non-uniform filename lengths: ``` time-series/test_img_T2 time-series/test_img_T4 time-series/test_img_T6 time-series/test_img_T8 time-series/test_img_T10 time-series/test_img_T12 ``` In this example, leading zeroes are missing in the variable fields of some filenames. 
A typical command to convert this folder to a single OME-TIFF would look like: \\ `batchconvert --ometiff --merge_files "input_dir/time-series" "output_path"` However, this command would fail to create a single OME-Zarr folder due to the non-uniform lengths of the filenames. Instead, the files would be split into two groups based on the filename length, leading to two separate OME-Zarrs with names: `test_img_TRange{2-8-2}.ome.zarr` and `test_img_TRange{10-12-2}.ome.zarr` Here is the corrected version of the folder for the above example- ``` time-series/test_img_T02 time-series/test_img_T04 time-series/test_img_T06 time-series/test_img_T08 time-series/test_img_T10 time-series/test_img_T12 ``` Executing the same command on this folder would result in a single OME-Zarr with the name: `test_img_TRange{02-12-2}.ome.zarr` **Example 2**- In this example, the filename lengths are uniform but the incrementation within the variable field is not. ``` time-series/test_img_T2 time-series/test_img_T4 time-series/test_img_T5 time-series/test_img_T7 ``` A typical command to convert this folder to a single OME-Zarr would look like: \\ `batchconvert --omezarr --merge_files "input_dir/time-series" "output_path"` However, the command would fail to assume these files as a single group due to the non-uniform incrementation in the variable field of the filenames. Instead, the dataset would be split into two groups, leading to two separate OME-Zarrs with the following names: `test_img_TRange{2-4-2}.ome.zarr` and `test_img_TRange{5-7-2}.ome.zarr` **Example 3** This is an example of a case where the conversion attempts to concatenate files along two dimensions, channel and time. 
``` multichannel_time-series/test_img_C1-T1 multichannel_time-series/test_img_C1-T2 multichannel_time-series/test_img_C1-T3 multichannel_time-series/test_img_C2-T1 multichannel_time-series/test_img_C2-T2 ``` To convert this folder to a single OME-Zarr, one could try the following command: \\ `batchconvert --omezarr --merge_files "input_dir/multichannel_time-series" "output_path"` However, since the channel-2 does not have the same number of timeframes as the channel-1, BatchConvert will fail to assume these two channels as part of the same series and will instead split the two channels into two separate OME-Zarrs. The output would look like: \\ `test_img_C1-TRange{1-3-1}.ome.zarr` \\ `test_img_C2-TRange{1-2-1}.ome.zarr` To be able to really incorporate all files into a single OME-Zarr, the folder should have equal number of images corresponding to both channels, as shown below: ``` multichannel_time-series/test_img_C1-T1 multichannel_time-series/test_img_C1-T2 multichannel_time-series/test_img_C1-T3 multichannel_time-series/test_img_C2-T1 multichannel_time-series/test_img_C2-T2 multichannel_time-series/test_img_C2-T3 ``` The same conversion command on this version of the input folder would result in a single OME-Zarr with the name: \\ `test_img_CRange{1-2-1}-TRange{1-3-1}.ome.zarr` **Example 4** This is another example of a case, where there are multiple filename patterns in the input folder. 
``` folder_with_multiple_groups/test_img_C1-T1 folder_with_multiple_groups/test_img_C1-T2 folder_with_multiple_groups/test_img_C2-T1 folder_with_multiple_groups/test_img_C2-T2 folder_with_multiple_groups/test_img_T1-Z1 folder_with_multiple_groups/test_img_T1-Z2 folder_with_multiple_groups/test_img_T1-Z3 folder_with_multiple_groups/test_img_T2-Z1 folder_with_multiple_groups/test_img_T2-Z2 folder_with_multiple_groups/test_img_T2-Z3 ``` One can convert this folder with- \\ `batchconvert --omezarr --merge_files "input_dir/folder_with_multiple_groups" "output_path"` BatchConvert will detect the two patterns in this folder and perform two grouped conversions. The output folders will be named as `test_img_CRange{1-2-1}-TRange{1-2-1}.ome.zarr` and `test_img_TRange{1-2-1}-ZRange{1-3-1}.ome.zarr`. **Example 5** Now imagine that we have the same files as in the example 4 but the filenames of the first group lack any dimension specifier, so we have the following folder: ``` folder_with_multiple_groups/test_img_1-1 folder_with_multiple_groups/test_img_1-2 folder_with_multiple_groups/test_img_2-1 folder_with_multiple_groups/test_img_2-2 folder_with_multiple_groups/test_img_T1-Z1 folder_with_multiple_groups/test_img_T1-Z2 folder_with_multiple_groups/test_img_T1-Z3 folder_with_multiple_groups/test_img_T2-Z1 folder_with_multiple_groups/test_img_T2-Z2 folder_with_multiple_groups/test_img_T2-Z3 ``` In such a scenario, BatchConvert allows the user to specify the concatenation axes via `--concatenation_order` option. This option expects comma-separated strings of dimensions for each group. In this example, the user must provide a string of 2 characters, such as `ct` for channel and time, for group 1, since there are two variable fields for this group. 
Since group 2 already has dimension specifiers (T and Z as specified in the filenames preceding the variable fields), the user does not need to specify anything for this group, and can enter `auto` or `aa` for automatic detection of the specifiers. So the following line can be used to convert this folder: \\ `batchconvert --omezarr --merge_files --concatenation_order ct,aa "input_dir/folder_with_multiple_groups" "output_path"` The resulting OME-Zarrs will have the names: `test_img_CRange{1-2-1}-TRange{1-2-1}.ome.zarr` and `test_img_TRange{1-2-1}-ZRange{1-3-1}.ome.zarr` Note that `--concatenation_order` will override any dimension specifiers already existing in the filenames. **Example 6** There can be scenarios where the user may want to have further control over the axes along which to concatenate the images. For example, the filenames might contain the data acquisition date, which can be recognised by BatchConvert as a concatenation axis in the automatic detection mode. An example of such a fileset might look like: ``` filenames_with_dates/test_data_date03.03.2023_imageZ1-T1 filenames_with_dates/test_data_date03.03.2023_imageZ1-T2 filenames_with_dates/test_data_date03.03.2023_imageZ1-T3 filenames_with_dates/test_data_date03.03.2023_imageZ2-T1 filenames_with_dates/test_data_date03.03.2023_imageZ2-T2 filenames_with_dates/test_data_date03.03.2023_imageZ2-T3 filenames_with_dates/test_data_date04.03.2023_imageZ1-T1 filenames_with_dates/test_data_date04.03.2023_imageZ1-T2 filenames_with_dates/test_data_date04.03.2023_imageZ1-T3 filenames_with_dates/test_data_date04.03.2023_imageZ2-T1 filenames_with_dates/test_data_date04.03.2023_imageZ2-T2 filenames_with_dates/test_data_date04.03.2023_imageZ2-T3 ``` One may try the following command to convert this folder: `batchconvert --omezarr --merge_files "input_dir/filenames_with_dates" "output_path"` Since the concatenation axes are not specified, this command would try to create a single OME-Zarr with name: 
`test_data_dateRange{03-04-1}.03.2023_imageZRange{1-2-1}-TRange{1-3-1}`. In order to force BatchConvert to ignore the date field, the user can restrict the concatenation axes to the last two numeric fields. This can be done by using a command such as: \\ `batchconvert --omezarr --merge_files --concatenation_order aa "input_dir/filenames_with_dates" "output_path"` \\ This command will avoid concatenation along the date field, and therefore, there will be two OME-Zarrs corresponding to the two dates. The number of characters being passed to the `--concatenation_order` option specifies the number of numeric fields (starting from the right end of the filename) that are recognised by the BatchConvert as valid concatenation axes. Passing `aa`, therefore, means that the last two numeric fields must be recognised as concatenation axes and the dimension type should be automatically detected (`a` for automatic). In the same logic, one could, for example, convert each Z section into a separate OME-Zarr by specifying `--concatenation_order a`. ### Conversion on slurm All the examples given above can also be run on slurm by specifying `-pf cluster` option. Note that this option automatically uses the singularity profile:\\ `batchconvert omezarr -pf cluster -p .oir "input_path" "output_path"` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.453.3" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Nextflow, bash, Python, NGFF, OME-Zarr, Conversion, imaging, bioimaging, image file format, file conversion, OME-TIFF, S3, BioStudies, bioformats, bioformats2raw" ; ns1:license ; ns1:name "BatchConvert" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-04-12T09:18:18Z"^^ns1:Date ; ns1:dateModified "2023-04-13T06:50:58Z"^^ns1:Date ; ns1:description """**Assembly and quantification metatranscriptome using metagenome data**. Version: see VERSION ## Introduction **MetaGT** is a bioinformatics analysis pipeline used for improving and quantifying metatranscriptome assembly using metagenome data. The pipeline supports Illumina sequencing data and complete metagenome and metatranscriptome assemblies. The pipeline involves the alignment of metatranscriptome assembly to the metagenome assembly with further extracting CDSs, which are covered by transcripts. The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. The Nextflow DSL2 implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](https://bioconda.github.io/) ## Quick Start 1. Install [`nextflow`](https://nf-co.re/usage/installation) 2. Install any of [`Conda`](https://conda.io/miniconda.html) for full pipeline reproducibility 3. Download the pipeline, e.g. by cloning metaGT GitHub repository: ```bash git clone git@github.com:ablab/metaGT.git ``` 4. Test it on a minimal dataset by running: ```bash nextflow run metaGT -profile test,conda ``` 5. Start running your own analysis! 
> Typical command for analysis using reads: ```bash nextflow run metaGT -profile --dna_reads '*_R{1,2}.fastq.gz' --rna_reads '*_R{1,2}.fastq.gz' ``` > Typical command for analysis using multiple files with reads: ```bash nextflow run metaGT -profile --dna_reads '*.yaml' --rna_reads '*.yaml' --yaml ``` > Typical command for analysis using assemblies: ```bash nextflow run metaGT -profile --genome '*.fasta' --transcriptome '*.fasta' ``` ## Pipeline Summary Optionally, if raw reads are used: * Sequencing quality control (`FastQC`) * Assembly metagenome or metatranscriptome (`metaSPAdes, rnaSPAdes `) By default, the pipeline currently performs the following: * Annotation metagenome (`Prokka`) * Aligning metatranscriptome on metagenome (`minimap2`) * Annotation unaligned transcripts (`TransDecoder`) * Clustering covered CDS and CDS from unaligned transcripts (`MMseqs2`) * Quantifying abundances of transcripts (`kallisto`) ## Citation MetaGT was developed by Daria Shafranskaya and Andrey Prjibelski. If you use it in your research please cite: [MetaGT: A pipeline for de novo assembly of metatranscriptomes with the aid of metagenomic data](https://doi.org/10.3389/fmicb.2022.981458) ## Feedback and bug report If you have any questions, please leave an issue at our [GitHub page](https://github.com/ablab/metaGT/issues). """ ; ns1:keywords "Metagenomics, metatranscriptomics, expression, Multi-omics" ; ns1:license ; ns1:name "MetaGT: A pipeline for de novo assembly of metatranscriptomes with the aid of metagenomic data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "API key for CDS service" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Longitude for right-edge of domain" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "emepcores" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Day for end date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Month for end date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Year for end date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "generate_metdir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "generate_rundir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Geographic inputs for geogrid" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Geogrid data table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HTTPS proxy information, if needed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EMEP Input Files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Directory name for WRF input Files, should match 'meteo' base-directory in namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EMEP configuration file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Geogrid namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metgrid configuration" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Real preprocessor Configuration File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Configuration File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Configuration File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "WRF Configuration File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Latitude for top of domain" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "outname_atm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "outname_sfc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "realcores" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EMEP run label, for output files, should match 'runlabel1' in namelist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Latitude for bottom of domain" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Day for starting date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Month for starting date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Year for starting date" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "grib variable table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "grib variable table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Longitude for left-edge of domain" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "wrfcores" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output files" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-04-12T12:04:44Z"^^ns1:Date ; ns1:dateModified "2023-04-12T12:45:16Z"^^ns1:Date ; ns1:description """# WRF/EMEP Linear Workflow Example Common Workflow Language (CWL) workflow and tool descriptors for running the Weather Research and Forecasting (WRF) and EMEP models. This workflow is designed for a single model domain. Example datasets for testing this workflow can be downloaded from Zenodo. ## Requirements: * docker or singularity * conda * cwltool * Toil - optional, useful for running on HPC or distributed computing systems ### CWL / Toil Installation: The workflow runner (either cwltool, or Toil) can be installed using either conda or pip. 
Environment files for conda are included, and can be used as shown below: * cwltool only: * `conda env create --file install/env_cwlrunner.yml --name cwl` * Toil & cwltool: * `conda env create --file install/env_toil.yml --name toil` ### Setup for Example Workflow * Download the example dataset from Zenodo: https://doi.org/10.5281/zenodo.7817216 * Extract into the `input_files` directory: * `tar -zxvf wrf_emep_UK_example_inputs.tar.gz -C input_files --strip-components=1` ## Running the Workflow The full workflow is broken into several logical steps: 1. ERA5 download 2. WPS 1st step: Geogrid geography file creation 3. WPS process: ungribbing of ERA5 data, and running of metgrid to produce meteorology files. 4. WRF process: generation of WRF input files by REAL, and running of WRF model 5. EMEP model: running of EMEP chemistry and transport model Steps 1 and 3 require you to register with the CDS service, in order to download ERA5 data before using in the WPS process. Steps 2 and 5 require you to download extra input data - the instructions on how to do this are included in the README.txt files in the relevant input data directories. A full workflow for all steps is provided here. But each separate step can be run on its own too, following the instructions given below. We recommend running step 4 first, to explore how the REAL & WRF workflow works, before trying the other steps. ### 1. ERA5 download. Before running the ERA5 download tool, ensure that you have registered for the CDS service, signed the ERA5 licensing agreement, and saved the CDS API key (`.cdsapirc`) in your working directory. To run the ERA5 download tool use the following command: ``` cwltool [--cachedir CACHE] [--singularity] workflows/era5_workflow.cwl example_workflow_configurations/era5_download_settings.yaml ``` Note that the `--cachedir CACHE` option sets the working directory cache, which enables the reuse of any steps previously run (and the restarting of the workflow from this point). 
The `--singularity` option is needed if you are using singularity instead of docker. ### 2. WPS: Geogrid geography file creation Before running the geogrid tool you will need to download the geography data from the [UCAR website](https://www2.mmm.ucar.edu/wrf/users/download/get_sources_wps_geog.html). These should be extracted into the `input_files/geogrid_geog_input` directory. To run the geogrid program use the following command: ``` cwltool [--cachdir CACHE] [--singularity] workflows/geogrid_workflow.cwl example_workflow_configurations/wps_geogrid_cwl_settings.yaml ``` ### 3. WPS: Creation of meteorology input files Before running the WPS process you will have to download the ERA5 datafiles (which will be called `preslev_[YYYYMMDD].grib` and `surface_[YYYYMMDD].grib`) and copy these to the directory `input_files/wps_era5_input`. If you have also run geogrid in step 2 you can replace the `geo_em.d01.nc` file in the `input_files/wps_geogrid_input` directory with the file that geogrid created. To run the wps metgrid process use the following command: ``` cwltool [--cachdir CACHE] [--singularity] workflows/wps_workflow.cwl example_workflow_configurations/wps_metgrid_cwl_settings.yaml ``` ### 4. WRF: Creation of WRF input files, and running WRF model The WRF model can be run without any prepreparation, except for the downloading of the input data from Zenodo. However, if you have created new meteorology files (`met_em*`) using WPS you can replace the files in the `input_files/wrf_met_input` directory with these. To run the WRF process (including REAL) use the following command: ``` cwltool [--cachdir CACHE] [--singularity] workflows/wrf_workflow.cwl example_workflow_configurations/wrf_real_cwl_settings.yaml ``` ### 5. EMEP: Running EMEP chemistry and transport model Before running the EMEP model you will need to download the EMEP input dataset. This can be done using the `catalog.py` tool, following the instructions in the `input_files/emep_input/README.txt` file. 
If you have run WRF you can also replace the `wrfout*` data files in the `input_Files/emep_wrf_input` directory with those you have created. To run the EMEP model use the following command: ``` cwltool [--cachdir CACHE] [--singularity] workflows/emep_workflow.cwl example_workflow_configurations/emep_cwl_settings.yaml ``` ### Full Workflow Before running the full workflow make sure you have carried out the setup tasks described above. To run the full workflow use the following command: ``` cwltool [--cachdir CACHE] [--singularity] wrf_emep_full_workflow.cwl example_workflow_configurations/wrf_emep_full_workflow_cwl_settings.yaml ``` ## Notes ### WRF filenames In order to work with singularity, all filenames need to exclude special characters. To ensure that all WRF filenames comply with this requirement, you will need to add the `nocolons = .true.` option to your WPS, REAL and WRF namelists to ensure this. ### MPI parallel processing The WPS processes all run in single thread mode. REAL, WRF and EMEP have been compiled with MPI support. The default cores for each of these is 2, 9 and 9, respectively. The settings file can be edited to modify these requirements. ### Caching intermediate workflow steps To cache the data from individual steps you can use the `--cachedir ` optional flag. ## License and Copyright These workflow scripts have been developed by the [Research IT](https://research-it.manchester.ac.uk/) at the [University of Manchester](https://www.manchester.ac.uk/). Copyright 2023 [University of Manchester, UK](https://www.manchester.ac.uk/). Licensed under the MIT license, see the LICENSE file for details.""" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "" ; ns1:license ; ns1:name "WRF / EMEP Linear Workflow" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-04-14T01:48:47Z"^^ns1:Date ; ns1:dateModified "2023-04-14T01:50:33Z"^^ns1:Date ; ns1:description "cccccc" ; ns1:keywords "" ; ns1:license ; ns1:name "Formula" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Pi" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-04-14T02:26:04Z"^^ns1:Date ; ns1:dateModified "2023-04-14T02:26:04Z"^^ns1:Date ; ns1:description "" ; ns1:keywords "" ; ns1:license ; ns1:name "PyUtils" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2020-10-28T11:07:26Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:43:45Z"^^ns1:Date ; ns1:description """Amplicon analysis workflow using NG-Tax **Steps:** * Quality control on the reads * Execute NGTax for ASV detection and classification For more information about NG-Tax 2.0 have a look at https://doi.org/10.3389/fgene.2019.01366""" ; ns1:image ; ns1:input , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Amplicon, 16S, ITS" ; ns1:license ; ns1:name "NGTax" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-04-27T13:43:13Z"^^ns1:Date ; ns1:dateModified "2023-06-16T06:10:32Z"^^ns1:Date ; ns1:description """# Gene similariy anaylsis across physiological systems in IMPC phenotype data A Jupyter Notebook tool for analysing user specified genes across the different physiological systems in IMPC data. 
**_Input_** The tool takes as input a list of gene ids (MGI ids or Gene Symbol ids). The elements in the list could be separated by a comma, semicolon, tab or newline. **_Operation_** The program will create a heatmap representing the number of phenotypes and the mp term list for each gene contained in an [IMPC physiological system](https://www.mousephenotype.org/help/data-visualization/gene-pages/phenogrid/). Using the slider, adjust the threshold to set the minimum count to be displayed in the heatmap. NB: Genes without phenotypes in any physiological system will not be displayed. Also, the labels of the heatmap will use Gene Symbols independently from the type of id used in the input. **_Tool access:_** [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AndreaFurlani/Jupyter_interactive_plots/main?urlpath=voila%2Frender%2FInteractive_plots.ipynb)""" ; ns1:keywords "jupyter" ; ns1:license ; ns1:name "Gene similariy anaylsis across physiological systems in IMPC phenotype data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-04-27T13:57:37Z"^^ns1:Date ; ns1:dateModified "2023-05-25T09:07:40Z"^^ns1:Date ; ns1:description """# **Phenotype similarity analysis** A Jupyter Notebook for analyzing phenotyping similarities across user specified genes. Phenotypes are retrieved from the MGI resource **_Input_** The tool takes as input a list of gene ids (MGI ids or Gene Symbol ids). The elements in the list could be separated by a comma, semicolon, tab or newline. **_Operation_** The Notebook will create a table where row and columns names are the Gene Symbols of the input elements and each cell will contain the name of the common phenotypes shared by those genes. 
Then an interactive heatmap will be displayed, showing also the count of those phenotypes. Using the slider, adjust the threshold to set the minimum count to be displayed in the heatmap. NB: Genes with only counts below the threshold will not be displayed in the heatmap. Also, the labels of the heatmap will use Gene Symbols independently from the type of id used in the input. **_Tool access:_** [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/AndreaFurlani/Jupyter_alliance/main?urlpath=voila%2Frender%2FAlliance_API_query.ipynb) """ ; ns1:keywords "" ; ns1:license ; ns1:name "Mouse phenotype similarity analyis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-12-03T15:41:02Z"^^ns1:Date ; ns1:dateModified "2024-12-03T15:41:02Z"^^ns1:Date ; ns1:description """# MGnify genomes catalogue pipeline [MGnify](https://www.ebi.ac.uk/metagenomics/) A pipeline to perform taxonomic and functional annotation and to generate a catalogue from a set of isolate and/or metagenome-assembled genomes (MAGs) using the workflow described in the following publication: Gurbich TA, Almeida A, Beracochea M, Burdett T, Burgin J, Cochrane G, Raj S, Richardson L, Rogers AB, Sakharova E, Salazar GA and Finn RD. (2023) [MGnify Genomes: A Resource for Biome-specific Microbial Genome Catalogues.](https://www.sciencedirect.com/science/article/pii/S0022283623000724) J Mol Biol. 
doi: https://doi.org/10.1016/j.jmb.2023.168016 Detailed information about existing MGnify catalogues: https://docs.mgnify.org/src/docs/genome-viewer.html ### Tools used in the pipeline | Tool/Database | Version | Purpose | |--------------------------------------------------------------------------------------------------|-------------------|------------------------------------------------------------------------------------------------------------------------| | CheckM2 | 1.0.1 | Determining genome quality | | dRep | 3.2.2 | Genome clustering | | Mash | 2.3 | Sketch for the catalogue; placement of genomes into clusters (update only); strain tree | | GUNC | 1.0.3 | Quality control | | GUNC DB | 2.0.4 | Database for GUNC | | GTDB-Tk | 2.4.0 | Assigning taxonomy; generating alignments | | GTDB | r220 | Database for GTDB-Tk | | Prokka | 1.14.6 | Protein annotation | | IQ-TREE 2 | 2.2.0.3 | Generating a phylogenetic tree | | Kraken 2 | 2.1.2 | Generating a kraken database | | Bracken | 2.6.2 | Generating a bracken database | | MMseqs2 | 13.45111 | Generating a protein catalogue | | eggNOG-mapper | 2.1.11 | Protein annotation (eggNOG, KEGG, COG, CAZy) | | eggNOG DB | 5.0.2 | Database for eggNOG-mapper | | Diamond | 2.0.11 | Protein annotation (eggNOG) | | InterProScan | 5.62-94.0 | Protein annotation (InterPro, Pfam) | | kegg-pathways-completeness tool | 1.0.5 | Computes KEGG pathway completeness | | CRISPRCasFinder | 4.3.2 | Annotation of CRISPR arrays | | AMRFinderPlus | 3.11.4 | Antimicrobial resistance gene annotation; virulence factors, biocide, heat, acid, and metal resistance gene annotation | | AMRFinderPlus DB | 3.11 2023-02-23.1 | Database for AMRFinderPlus | | antiSMASH | 7.1.0 | Biosynthetic gene cluster annotation | | GECCO | 0.9.8 | Biosynthetic gene cluster annotation | | SanntiS | 0.9.3.2 | Biosynthetic gene cluster annotation | | DefenseFinder | 1.2.0 | Annotation of anti-phage systems | | DefenseFinder models | 1.2.3 | Database for DefenseFinder | | 
run_dbCAN | 4.1.2 | Polysaccharide utilization loci prediction | | dbCAN DB | V12 | Database for run_dbCAN | | Infernal | 1.1.4 | RNA predictions | | tRNAscan-SE | 2.0.9 | tRNA predictions | | Rfam | 14.9 | Identification of SSU/LSU rRNA and other ncRNAs | | Panaroo | 1.3.2 | Pan-genome computation | | Seqtk | 1.3 | Generating a gene catalogue | | VIRify | 2.0.1 | Viral sequence annotation | | [Mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) | 2.0.2 | Mobilome annotation | | samtools | 1.15 | FASTA indexing | ## Setup ### Environment The pipeline is implemented in [Nextflow](https://www.nextflow.io/). Requirements: - [singulairty](https://sylabs.io/docs/) or [docker](https://www.docker.com/) #### Reference databases The pipeline needs the following reference databases and configuration files (roughtly ~150G): - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/gunc_db_2.0.4.dmnd.gz - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/eggnog_db_5.0.2.tgz - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/rfam_14.9/ - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/kegg_classes.tsv - ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/genomes-pipeline/continent_countries.csv - https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz - ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.11/2023-02-23.1 - https://zenodo.org/records/4626519/files/uniref100.KO.v1.dmnd.gz ### Containers This pipeline requires [singularity](https://sylabs.io/docs/) or [docker](https://www.docker.com/) as the container engine to run pipeline. The containers are hosted in [biocontainers](https://biocontainers.pro/) and [quay.io/microbiome-informatics](https://quay.io/organization/microbiome-informatics) repository. 
It's possible to build the containers from scratch using the following script: ```bash cd containers && bash build.sh ``` ## Running the pipeline ## Data preparation 1. You need to pre-download your data to directories and make sure that genomes are uncompressed. Scripts to fetch genomes from ENA ([fetch_ena.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ena.py)) and NCBI ([fetch_ncbi.py](https://github.com/EBI-Metagenomics/genomes-pipeline/blob/master/bin/fetch_ncbi.py)) are provided and need to be executed separately from the pipeline. If you have downloaded genomes from both ENA and NCBI, put them into separate folders. 2. When genomes are fetched from ENA using the `fetch_ena.py` script, a CSV file with contamination and completeness statistics is also created in the same directory where genomes are saved to. If you are downloading genomes using a different approach, a CSV file needs to be created manually (each line should be genome accession, % completeness, % contamination). The ENA fetching script also pre-filters genomes to satisfy the QS50 cut-off (QS = % completeness - 5 * % contamination). 3. You will need the following information to run the pipeline: - catalogue name (for example, zebrafish-faecal) - catalogue version (for example, 1.0) - catalogue biome (for example, root:Host-associated:Human:Digestive system:Large intestine:Fecal) - min and max accession number to be assigned to the genomes (only MGnify specific). Max - Min = #total number of genomes (NCBI+ENA) ### Execution The pipeline is built in [Nextflow](https://www.nextflow.io), and utilized containers to run the software (we don't support conda ATM). In order to run the pipeline it's required that the user creates a profile that suits their needs, there is an `ebi` profile in `nexflow.config` that can be used as template. 
After downloading the databases and adjusting the config file: ```bash nextflow run EBI-Metagenomics/genomes-pipeline -c -profile \\ --genome-prefix=MGYG \\ --biome="root:Host-associated:Fish:Digestive system" \\ --ena_genomes= \\ --ena_genomes_checkm= \\ --mgyg_start=0 \\ --mgyg_end=10 \\ --preassigned_accessions= --catalogue_name=zebrafish-faecal \\ --catalogue_version="1.0" \\ --ftp_name="zebrafish-faecal" \\ --ftp_version="v1.0" \\ --outdir="" ``` ### Development Install development tools (including pre-commit hooks to run Black code formatting). ```bash pip install -r requirements-dev.txt pre-commit install ``` #### Code style Use Black, this tool is configured if you install the pre-commit tools as above. To manually run them: black . ### Testing This repo has 2 set of tests, python unit tests for some of the most critical python scripts and [nf-test](https://github.com/askimed/nf-test) scripts for the nextflow code. To run the python tests ```bash pip install -r requirements-test.txt pytest ``` To run the nextflow ones the databases have to downloaded manually, we are working to improve this. ```bash nf-test test tests/* ``` """ ; ns1:isBasedOn ; ns1:keywords "Metagenomics, Nextflow, Bioinformatics" ; ns1:license ; ns1:name "MGnify genomes catalogue pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-05-02T11:51:45Z"^^ns1:Date ; ns1:dateModified "2023-05-02T11:58:38Z"^^ns1:Date ; ns1:description """# Introduction `katdetectr` is an *R* package for the detection, characterization and visualization of localized hypermutated regions, often referred to as *kataegis*. Please see the [Application Note](https://www.biorxiv.org/content/10.1101/2022.07.11.499364v1) (under submission) for additional background, details and performance evaluations of `katdetectr`. 
The general workflow of `katdetectr` can be summarized as follows: 1. Import of genomic variants; VCF, MAF or VRanges objects. 2. Detection of kataegis foci. 3. Visualization of segmentation and kataegis foci. Please see the [vignette](https://bioconductor.org/packages/release/bioc/vignettes/katdetectr/inst/doc/General_overview.html) for an overview of the workflow in a step-by-step manner on publicly-available datasets which are included within this package. ## Installation Download katdetectr from BioConductor: ```R if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("katdetectr") ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.463.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Katdetectr" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-12-12T03:15:52Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:54:50Z"^^ns1:Date ; ns1:description """From the R1 and R2 fastq files of a single sample, make a scRNAseq counts matrix, and perform basic QC with scanpy. Then, do further processing by making a UMAP and clustering. Produces a processed AnnData Deprecated: use individual workflows instead for multiple samples""" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq Single Sample Processing STARSolo" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/AnnData - Loaded " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MaxMTpc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MinCountPerCell" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/MinGenesPerCell" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Mitochondrial Prefix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/genecount_qc_plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mito_qc_plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/qc_anndata_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/top_genes_plot" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:51:13Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:51:13Z"^^ns1:Date ; ns1:description "Take an anndata file, and perform basic QC with scanpy. Produces a filtered AnnData object." ; ns1:input , , , , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq_CellQC" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/QCFilteredAnnDataObject" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/1k_cell_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/1k_gene_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Scanpy FindCluster on input dataset(s): Clusters AnnData" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Scanpy RunUMAP on input dataset(s): UMAP object AnnData" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/marker_dot_plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/marker_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/processed_anndata_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/umap_cluster_plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/umap_sample_plot" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:52:35Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:52:35Z"^^ns1:Date ; ns1:description "Basic processing of a QC-filtered Anndata Object. UMAP, clustering e.t.c " ; ns1:input ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq_QCtoBasicProcessing" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "Ruby bioinformatics toolkit" ; ns1:name "Rbbt" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-09T09:43:39Z"^^ns1:Date ; ns1:dateModified "2023-05-23T12:33:53Z"^^ns1:Date ; ns1:description """Rbbt implementation of the Covid-19 pilot workflow from the Personalized Medicine Center of Excellence. 
This workflow processes single cell data to personalize boolean models that are then used in a multi-scale cellular simulation using PhysiBoSS.""" ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Covid19 Pilot workflow (Rbbt)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-07-23T18:22:21Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:09Z"^^ns1:Date ; ns1:description "Abstract CWL Automatically generated from the Galaxy workflow file: Workflow with Copernicus Essential Climate Variable - select and plot" ; ns1:image ; ns1:keywords "Galaxy, Climate, copernicus" ; ns1:license ; ns1:name "Copernicus Essential Climate Variable - select and plot" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-11T08:29:41Z"^^ns1:Date ; ns1:dateModified "2024-09-09T08:06:05Z"^^ns1:Date ; ns1:description """Correlation between Phenotypic and In Silico Detection of Antimicrobial Resistance in Salmonella enterica in Canada Using Staramr. Doi: [10.3390/microorganisms10020292](https://doi.org/10.3390/microorganisms10020292) | tool | version | license | | -- | -- | -- | | staramr | 0.8.0 | [Apache-2.0 license](https://github.com/phac-nml/staramr/blob/development/LICENSE) | """ ; ns1:keywords "AMR, AMR-detection, 10.3390/microorganisms10020292, Bioinformatics, antimicrobial resistance" ; ns1:license ; ns1:name "Workflow 4: Staramr" ; ns1:output , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "body" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "is_availability" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "result_modifiers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_file" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-23T13:58:11Z"^^ns1:Date ; ns1:dateModified "2024-11-12T12:55:51Z"^^ns1:Date ; ns1:description """# rquest-omop-worker-workflows Source for workflow definitions for the open source RQuest OMOP Worker tool developed for Hutch/TRE-FX Note: ARM workflows are currently broken. x86 ones work. ## Inputs ### Body Sample input payload: ```json { "task_id": "job-2023-01-13-14: 20: 38-", "project": "", "owner": "", "cohort": { "groups": [ { "rules": [ { "varname": "OMOP", "varcat": "Person", "type": "TEXT", "oper": "=", "value": "8507" } ], "rules_oper": "AND" } ], "groups_oper": "OR" }, "collection": "", "protocol_version": "", "char_salt": "", "uuid": "" } ``` ### Database access Currently this workflow requires inputs for connecting to the database it will run queries against. In future this may be moved to environment variables.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.471.3" ; ns1:image ; ns1:input , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "rquest-omop-worker-workflow" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CPAT_header_tab" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GRCh38_p13_genome_fa_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam-A_hmm_dat_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam-A_hmm_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "active_site_dat_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gencode_v43_annotation_gtf_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gencode_v43_lncRNA_transcripts_fa_gz" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gencode_v43_pc_transcripts_fa_gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gencode_v43_transcripts_fa_gz" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-05-25T21:57:53Z"^^ns1:Date ; ns1:dateModified "2023-05-25T21:58:58Z"^^ns1:Date ; ns1:description "This workflow correspond to the Genome-wide alternative splicing analysis training. It allows to analyze isoform switching by making use of IsoformSwitchAnalyzeR." ; ns1:image ; ns1:input , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Transcriptomics, alternative-splicing, GTN" ; ns1:license ; ns1:name "Genome-wide alternative splicing analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:alternateName "UNICORE" ; ns1:name "Uniform Interface to Computing Resources" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-17T12:08:56Z"^^ns1:Date ; ns1:dateModified "2023-06-07T13:15:20Z"^^ns1:Date ; ns1:description """The radiation source ELBE (Electron Linac for beams with high Brilliance and low Emittance) at the Helmholtz Centre Dresden Rossendorf (HZDR) can produce several kinds of secondary radiations. THz radiation is one of them and can be used with a typical pulse frequency of 100 kHz as a stimulation source for elementary low-energy degrees of freedom in matter. To sample the whole THz wave the laser path length is modified by moving specific mirrors. The raw data contains for each mirror position a binary file storing the signal spectra and a folder with gray scaled tiff files storing the jitter timing. 
This Workflow is equivalent to the first part of the standalone jupyter notebook https://github.com/hzdr/TELBE-raw-data-evaluation/blob/main/sorting_binning.ipynb In the job file the folder < FOLDER_BASE> and < FOLDER_SUB> needs to be specified and the parameters as a json string like < PARAMS> = { "rep": 100000, "t_exp": 1, "N_sample": 96, "offset": 0, "pixel_to_ps": 0.0115, "Stage_zero": 0 } The python file which is used is originally published in gitlab https://codebase.helmholtz.cloud/science2workflow/telbe-sorting-binning/-/blob/master/src/ The workflow can automatically be monitored in Heliport if the project number < HELIPORT_PROJECT> is provided. """ ; ns1:keywords "" ; ns1:license ; ns1:name "Sorting and registration of Terahertz ELBE raw data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-06-21T10:41:38Z"^^ns1:Date ; ns1:dateModified "2023-06-21T10:41:38Z"^^ns1:Date ; ns1:description """# CWL-assembly [![Codacy Badge](https://api.codacy.com/project/badge/Grade/684724bbc0134960ab41748f4a4b732f)](https://www.codacy.com/app/mb1069/CWL-assembly?utm_source=github.com&utm_medium=referral&utm_content=EBI-Metagenomics/CWL-assembly&utm_campaign=Badge_Grade) [![Build Status](https://travis-ci.org/EBI-Metagenomics/CWL-assembly.svg?branch=develop)](https://travis-ci.org/EBI-Metagenomics/CWL-assembly) ## Description This repository contains two workflows for metagenome and metatranscriptome assembly of short read data. MetaSPAdes is used as default for paired-end data, and MEGAHIT for single-end data and co-assemblies. MEGAHIT can be specified as the default assembler in the yaml file if preferred. 
Steps include: * _QC_: removal of short reads, low quality regions, adapters and host decontamination * _Assembly_: with metaSPADES or MEGAHIT * _Post-assembly_: Host and PhiX decontamination, contig length filter (500bp), stats generation ## Requirements - How to install This pipeline requires a conda environment with cwltool, blastn, and metaspades. If created with `requirements.yml`, the environment will be called `cwl_assembly`. ``` conda env create -f requirements.yml conda activate cwl_assembly pip install cwltool==3.1.20230601100705 ``` ## Databases You will need to pre-download fasta files for host decontamination and generate the following databases accordingly: * bwa index * blast index Specify the locations in the yaml file when running the pipeline. ## Main pipeline executables * `src/workflows/metagenome_pipeline.cwl` * `src/workflows/metatranscriptome_pipeline.cwl` ## Example command ```cwltool --singularity --outdir ${OUTDIR} ${CWL} ${YML}``` `$CWL` is going to be one of the executables mentioned above `$YML` should be a config yaml file including entries among what follows. You can find a yml template in the `examples` folder. 
## Example output directory structure ``` Root directory ├── megahit │ └── 001 -------------------------------- Assembly root directory │ ├── assembly_stats.json ------------ Human-readable assembly stats file │ ├── coverage.tab ------------------- Coverage file │ ├── log ---------------------------- CwlToil+megahit output log | ├── options.json ------------------- Megahit input options │ ├── SRR6257420.fasta.gz ------------ Archived and trimmed assembly │ └── SRR6257420.fasta.gz.md5 -------- MD5 hash of above archive ├── metaspades │ └── 001 -------------------------------- Assembly root directory │ ├── assembly_graph.fastg ----------- Assembly graph │ ├── assembly_stats.json ------------ Human-readable assembly stats file │ ├── coverage.tab ------------------- Coverage file | ├── params.txt --------------------- Metaspades input options │ ├── spades.log --------------------- Metaspades output log │ ├── SRR6257420.fasta.gz ------------ Archived and trimmed assembly │ └── SRR6257420.fasta.gz.md5 -------- MD5 hash of above archive │  └── raw ------------------------------------ Raw data directory ├── SRR6257420.fastq.qc_stats.tsv ------ Stats for cleaned fastq ├── SRR6257420_fastp_clean_1.fastq.gz -- Cleaned paired-end file_1 └── SRR6257420_fastp_clean_2.fastq.gz -- Cleaned paired-end file_2 ``` """ ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Metagenome and metatranscriptome assembly in CWL" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-19T14:02:24Z"^^ns1:Date ; ns1:dateModified "2023-05-19T14:02:24Z"^^ns1:Date ; ns1:description """# EukRecover Pipeline to recover eukaryotic MAGs using CONCOCT, metaBAT2 and EukCC's merging algorythm. Needs paired end shotgun metagenomic reads. 
## Environment Eukrecover requires an environment with snakemake and metaWRAP. ## Quickstart Define your samples in the file `samples.csv`. This file needs to have the columns project and run to identify each metagenome. This pipeline does not support co-binning, but feel free to change it. Clone this repo wherever you want to run the pipeline: ``` git clone https://github.com/openpaul/eukrecover/ ``` You can then run the snakemake like so ``` snakemake --use-singularity ``` The pipeline uses dockerhub to fetch all tools, so make sure you have singularity installed. ## Prepare databases The pipeline will set up databases for you, but if you already have a EukCC or a BUSCO 5 database you can use them by specifying the location in the file `config/config.yaml` ## Output: In the folder results you will find a folder `MAGs` which will contain a folder `fa` containing the actual MAG fastas. In addition you will find stats for each MAG in the table `QC.csv`. This table contains the following columns: name,eukcc_compl,eukcc_cont,BUSCO_C,BUSCO_M,BUSCO_D,BUSCO_F,BUSCO_tax,N50,bp ## Citation: If you use this pipeline please make sure to cite all used software. For this please refer to the used rules. """ ; ns1:keywords "" ; ns1:license ; ns1:name "EukRecover" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-23T12:07:56Z"^^ns1:Date ; ns1:dateModified "2023-05-23T12:33:23Z"^^ns1:Date ; ns1:description """# COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow ## Table of Contents - [COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow](#covid-19-multiscale-modelling-of-the-virus-and-patients-tissue-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [Mahti or Puhti](#mahti-or-puhti) - [Requirements](#requirements) - [Steps](#steps) - [License](#license) - [Contact](#contact) ## Description Uses multiscale simulations to predict patient-specific SARS‑CoV‑2 severity subtypes (moderate, severe or control), using single-cell RNA-Seq data, MaBoSS and PhysiBoSS. Boolean models are used to determine the behaviour of individual agents as a function of extracellular conditions and the concentration of different substrates, including the number of virions. Predictions of severity subtypes are based on a meta-analysis of personalised model outputs simulating cellular apoptosis regulation in epithelial cells infected by SARS‑CoV‑2. The workflow uses the following building blocks, described in order of execution: 1. High-throughput mutant analysis 2. Single-cell processing 3. Personalise patient 4. PhysiBoSS 5. Analysis of all simulations For details on individual workflow steps, see the user documentation for each building block. 
[`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the COVID-19 Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs and Snakemake (in progress). ### Resources The ``Resources`` folder contains dataset files. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for the given small dataset. They can be executed individually for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the COVID19 Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) / [Snakemake](https://snakemake.readthedocs.io/en/stable/) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/covid-19-workflow.git ``` 2. Install the Building Blocks required for the COVID19 Workflow: ```bash covid-19-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - MaBoSS.singularity - meta_analysis.singularity - PhysiCell-COVID19.singularity - single_cell.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. 
Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build MaBoSS.sif MaBoSS.singularity sudo singularity build meta_analysis.sif meta_analysis.singularity sudo singularity build PhysiCell-COVID19.sif PhysiCell-COVID19.singularity sudo singularity build single_cell.sif single_cell.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs in installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` **If using Snakemake in local PC** (make sure that SnakeMake is installed): 4. Go to `Workflow/SnakeMake` folder ```bash cd Workflows/SnakeMake ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the COVID19 Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the COVID19 Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. This commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`COVID19WORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_covid19workflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. 
Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with COVID19 Workflow results. ### Mahti or Puhti This section explains how to run the COVID19 workflow on CSC supercomputers using SnakeMake. #### Requirements - Install snakemake (or check if there is a version installed using `module spider snakemake`) - Install workflow, using the same steps as for the local machine. With the exception that containers have to be built elsewhere. #### Steps 1. Go to `Workflow/SnakeMake` folder ```bash cd Workflow/SnakeMake ``` 2. Edit `launch.sh` with the correct partition, account, and resource specifications. 3. Execute `./launch.sh` > :warning: Snakemake provides a `--cluster` flag, but this functionality should be avoided as it's really not suited for HPC systems. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Covid19 Pilot workflow (PyCOMPSs)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-23T12:10:39Z"^^ns1:Date ; ns1:dateModified "2023-05-23T12:34:23Z"^^ns1:Date ; ns1:description """# Cancer Invasion Workflow ## Table of Contents - [Cancer Invasion Workflow](#cancer-invasion-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [Mahti or Puhti](#mahti-or-puhti) - [Requirements](#requirements) - [Steps](#steps) - [License](#license) - [Contact](#contact) ## Description Uses multiscale simulations to describe cancer progression into invasion. The workflow uses the following building blocks, described in order of execution: 1. PhysiBoSS-Invasion For details on individual workflow steps, see the user documentation for each building block. [`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the Cancer Invasion Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs and Snakemake (in progress). ### Resources The ``Resources`` folder contains dataset files. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for the given small dataset. They can be executed individually for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the Cancer Invasion Workflow in a laptop or desktop computer. 
#### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) / [Snakemake](https://snakemake.readthedocs.io/en/stable/) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/cancer-invasion-workflow ``` 2. Install the Building Blocks required for the Cancer Invasion Workflow: ```bash cancer-invasion-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - PhysiCell-Invasion.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build PhysiCell-Invasion.sif PhysiCell-Invasion.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs in installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` **If using Snakemake in local PC** (make sure that SnakeMake is installed): 4. Go to `Workflow/SnakeMake` folder ```bash cd Workflows/SnakeMake ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the Cancer Invasion Workflow in the MareNostrum 4 supercomputer. 
#### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the Cancer Invasion Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. This commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`CANCERINVASIONWORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_cancerinvasionworkflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with Cancer Invasion Workflow results. ### Mahti or Puhti This section explains how to run the Cancer Invasion workflow on CSC supercomputers using SnakeMake. #### Requirements - Install snakemake (or check if there is a version installed using `module spider snakemake`) - Install workflow, using the same steps as for the local machine. 
With the exception that containers have to be built elsewhere. #### Steps 1. Go to `Workflow/SnakeMake` folder ```bash cd Workflow/SnakeMake ``` 2. Edit `launch.sh` with the correct partition, account, and resource specifications. 3. Execute `./launch.sh` > :warning: Snakemake provides a `--cluster` flag, but this functionality should be avoided as it's really not suited for HPC systems. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Cancer Invasion" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-23T12:15:44Z"^^ns1:Date ; ns1:dateModified "2023-05-23T12:32:48Z"^^ns1:Date ; ns1:description """# Single drug prediction Workflow ## Table of Contents - [Single drug prediction Workflow](#single-drug-prediction-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [License](#license) - [Contact](#contact) ## Description Complementarily, the workflow supports single drug response predictions to provide a baseline prediction in cases where drug response information for a given drug and cell line is not available. 
As an input, the workflow needs basal gene expression data for a cell, the drug targets (they need to be known for untested drugs) and optionally CARNIVAL features (sub-network activity predicted with CARNIVAL building block) and predicts log(IC50) values. This workflow uses a custom matrix factorization approach built with Google JAX and trained with gradient descent. The workflow can be used both for training a model, and for predicting new drug responses. The workflow uses the following building blocks in order of execution (for training a model): 1. Carnival_gex_preprocess - Preprocessed the basal gene expression data from GDSC. The input is a matrix of Gene x Sample expression data. 2. Progeny - Using the preprocessed data, it estimates pathway activities for each column in the data (for each sample). It returns a matrix of Pathways x Samples with activity values for 11 pathways. 3. Omnipath - It downloads latest Prior Knowledge Network of signalling. This building block can be ommited if there exists already a csv file with the network. 4. TF Enrichment - For each sample, transcription factor activities are estimated using Dorothea. 5. CarnivalPy - Using the TF activities estimated before, it runs Carnival to obtain a sub-network consistent with the TF activities (for each sample). 6. Carnival_feature_merger - Preselect a set of genes by the user (if specified) and merge the features with the basal gene expression data. 7. ML Jax Drug Prediction - Trains a model using the combined features to predict IC50 values from GDSC. For details on individual workflow steps, please check the scripts that use each individual building block in the workflow [`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the Single Drug Prediction Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs. 
### Resources The ``Resources`` folder contains a small dataset for testing purposes. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for a small dataset. They can be executed individually *without PyCOMPSs installed* for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the Single Drug Prediction Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/single-drug-prediction-workflow.git ``` 2. Install the Building Blocks required for the COVID19 Workflow: ```bash single-drug-prediction-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - toolset.singularity - carnivalpy.singularity - ml-jax.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. Build the required Building Block images ```bash cd BuildingBlocks/Resources/images ## Download new BB singularity files wget https://github.com/saezlab/permedcoe/archive/refs/heads/master.zip unzip master.zip cd permedcoe-master/containers ## Build containers cd toolset sudo /usr/local/bin/singularity build toolset.sif toolset.singularity mv toolset.sif ../../../ cd .. 
cd carnivalpy sudo /usr/local/bin/singularity build carnivalpy.sif carnivalpy.singularity mv carnivalpy.sif ../../../ cd .. cd ml-jax sudo /usr/local/bin/singularity build ml-jax.sif ml-jax.singularity mv ml-jax.sif ../../../tf-jax.sif cd .. cd ../.. ## Cleanup rm -rf permedcoe-master rm master.zip cd ../../.. ``` > :warning: **TIP**: The singularity containers **can be downloaded** from: https://cloud.sylabs.io/library/pablormier **If using PyCOMPSs in local PC** (make sure that PyCOMPSs is installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` The execution is prepared to use the singularity images that **MUST** be placed into `BuildingBlocks/Resources/images` folder. If they are located in any other folder, please update the `run.sh` script setting the `PERMEDCOE_IMAGES` to the images folder. > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the Single Drug Prediction Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the Single Drug Prediction Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. These commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`SINGLE_DRUG_PREDICTION_WORKFLOW_DATASET` environment variable). 2. 
Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_single_drug_prediction_workflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with Single Drug Prediction Workflow results. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Single Drug Prediction" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-05-23T12:36:45Z"^^ns1:Date ; ns1:dateModified "2023-05-23T12:36:45Z"^^ns1:Date ; ns1:description """# Drug Synergies Screening Workflow ## Table of Contents - [Drug Synergies Screening Workflow](#drug-synergies-screening-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [License](#license) - [Contact](#contact) ## Description This pipeline simulates a drug screening on personalised cell line models. It automatically builds Boolean models of interest, then uses cell lines data (expression, mutations, copy number variations) to personalise them as MaBoSS models. Finally, this pipeline simulates multiple drug intervention on these MaBoSS models, and lists drug synergies of interest. The workflow uses the following building blocks, described in order of execution: 1. Build model from species 2. Personalise patient 3. MaBoSS 4. Print drug results For details on individual workflow steps, see the user documentation for each building block. [`GitHub repository`](https://github.com/PerMedCoE/drug-synergies-workflow>) ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the Drug Synergies Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs. ### Resources The ``Resources`` folder contains a small dataset for testing purposes. 
### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for a small dataset. They can be executed individually *without PyCOMPSs installed* for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the Drug Synergies Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/drug-synergies-workflow.git ``` 2. Install the Building Blocks required for the COVID19 Workflow: ```bash drug-synergies-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - PhysiCell-COVID19.singularity - printResults.singularity - MaBoSS_sensitivity.singularity - FromSpeciesToMaBoSSModel.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build PhysiCell-COVID19.sif PhysiCell-COVID19.singularity sudo singularity build printResults.sif printResults.singularity sudo singularity build MaBoSS_sensitivity.sif MaBoSS_sensitivity.singularity sudo singularity build FromSpeciesToMaBoSSModel.sif FromSpeciesToMaBoSSModel.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs in installed): 4. 
Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the Drug Synergies Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the Drug Synergies Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. This commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`DRUG_SYNERGIES_WORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_drug_synergies_workflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. 
After the execution, a `results` folder will be available with with Drug Synergies Workflow results. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Drug Synergy" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-07-23T18:33:44Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:15Z"^^ns1:Date ; ns1:description """Description: SSP-based RCP scenario with high radiative forcing by the end of century. Following approximately RCP8.5 global forcing pathway with SSP5 socioeconomic conditions. Concentration-driven. Rationale: the scenario represents the high end of plausible future pathways. SSP5 is the only SSP with emissions high enough to produce the 8.5 W/m2 level of forcing in 2100. This workflow is answering to the following scientific question: - Is it worth investing in artificial snowmaking equipment at RATECE-PLANICA?""" ; ns1:keywords "Climate, jupyter, cmip6" ; ns1:license ; ns1:name "RATECE-PLANICA ski station (Slovenia) under CMIP-6 SSP585 condition" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Henrik Nortamo" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-05-23T13:23:14Z"^^ns1:Date ; ns1:dateModified "2023-05-23T13:23:14Z"^^ns1:Date ; ns1:description """# COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow ## Table of Contents - [COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow](#covid-19-multiscale-modelling-of-the-virus-and-patients-tissue-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [Mahti or Puhti](#mahti-or-puhti) - [Requirements](#requirements) - [Steps](#steps) - [License](#license) - [Contact](#contact) ## Description Uses multiscale simulations to predict patient-specific SARS‑CoV‑2 severity subtypes (moderate, severe or control), using single-cell RNA-Seq data, MaBoSS and PhysiBoSS. Boolean models are used to determine the behaviour of individual agents as a function of extracellular conditions and the concentration of different substrates, including the number of virions. Predictions of severity subtypes are based on a meta-analysis of personalised model outputs simulating cellular apoptosis regulation in epithelial cells infected by SARS‑CoV‑2. The workflow uses the following building blocks, described in order of execution: 1. High-throughput mutant analysis 2. Single-cell processing 3. Personalise patient 4. PhysiBoSS 5. Analysis of all simulations For details on individual workflow steps, see the user documentation for each building block. 
[`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the COVID-19 Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs and Snakemake (in progress). ### Resources The ``Resources`` folder contains dataset files. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for the given small dataset. They can be executed individually for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the COVID19 Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) / [Snakemake](https://snakemake.readthedocs.io/en/stable/) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/covid-19-workflow.git ``` 2. Install the Building Blocks required for the COVID19 Workflow: ```bash covid-19-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - MaBoSS.singularity - meta_analysis.singularity - PhysiCell-COVID19.singularity - single_cell.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. 
Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build MaBoSS.sif MaBoSS.singularity sudo singularity build meta_analysis.sif meta_analysis.singularity sudo singularity build PhysiCell-COVID19.sif PhysiCell-COVID19.singularity sudo singularity build single_cell.sif single_cell.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs in installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` **If using Snakemake in local PC** (make sure that SnakeMake is installed): 4. Go to `Workflow/SnakeMake` folder ```bash cd Workflows/SnakeMake ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the COVID19 Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the COVID19 Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. This commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`COVID19WORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_covid19workflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. 
Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with COVID19 Workflow results. ### Mahti or Puhti This section explains how to run the COVID19 workflow on CSC supercomputers using SnakeMake. #### Requirements - Install snakemake (or check if there is a version installed using `module spider snakemake`) - Install workflow, using the same steps as for the local machine. With the exception that containers have to be built elsewhere. #### Steps 1. Go to `Workflow/SnakeMake` folder ```bash cd Workflow/SnakeMake ``` 2. Edit `launch.sh` with the correct partition, account, and resource specifications. 3. Execute `./launch.sh` > :warning: Snakemake provides a `--cluster` flag, but this functionality should be avoided as it's really not suited for HPC systems. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Covid19 Pilot workflow (Nextflow)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Henrik Nortamo" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-05-23T13:24:53Z"^^ns1:Date ; ns1:dateModified "2023-05-23T13:24:53Z"^^ns1:Date ; ns1:description """# COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow ## Table of Contents - [COVID-19 Multiscale Modelling of the Virus and Patients’ Tissue Workflow](#covid-19-multiscale-modelling-of-the-virus-and-patients-tissue-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [Mahti or Puhti](#mahti-or-puhti) - [Requirements](#requirements) - [Steps](#steps) - [License](#license) - [Contact](#contact) ## Description Uses multiscale simulations to predict patient-specific SARS‑CoV‑2 severity subtypes (moderate, severe or control), using single-cell RNA-Seq data, MaBoSS and PhysiBoSS. Boolean models are used to determine the behaviour of individual agents as a function of extracellular conditions and the concentration of different substrates, including the number of virions. Predictions of severity subtypes are based on a meta-analysis of personalised model outputs simulating cellular apoptosis regulation in epithelial cells infected by SARS‑CoV‑2. The workflow uses the following building blocks, described in order of execution: 1. High-throughput mutant analysis 2. Single-cell processing 3. Personalise patient 4. PhysiBoSS 5. Analysis of all simulations For details on individual workflow steps, see the user documentation for each building block. 
[`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the COVID-19 Workflow. ### Workflows The ``Workflow`` folder contains the workflows implementations. Currently contains the implementation using PyCOMPSs and Snakemake (in progress). ### Resources The ``Resources`` folder contains dataset files. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for the given small dataset. They can be executed individually for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the COVID19 Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) / [Snakemake](https://snakemake.readthedocs.io/en/stable/) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/covid-19-workflow.git ``` 2. Install the Building Blocks required for the COVID19 Workflow: ```bash covid-19-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - MaBoSS.singularity - meta_analysis.singularity - PhysiCell-COVID19.singularity - single_cell.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. 
Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build MaBoSS.sif MaBoSS.singularity sudo singularity build meta_analysis.sif meta_analysis.singularity sudo singularity build PhysiCell-COVID19.sif PhysiCell-COVID19.singularity sudo singularity build single_cell.sif single_cell.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs in installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. Execute `./run.sh` **If using Snakemake in local PC** (make sure that SnakeMake is installed): 4. Go to `Workflow/SnakeMake` folder ```bash cd Workflows/SnakeMake ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the COVID19 Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the COVID19 Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. This commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`COVID19WORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_covid19workflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. 
Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and other full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. > :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with COVID19 Workflow results. ### Mahti or Puhti This section explains how to run the COVID19 workflow on CSC supercomputers using SnakeMake. #### Requirements - Install snakemake (or check if there is a version installed using `module spider snakemake`) - Install workflow, using the same steps as for the local machine. With the exception that containers have to be built elsewhere. #### Steps 1. Go to `Workflow/SnakeMake` folder ```bash cd Workflow/SnakeMake ``` 2. Edit `launch.sh` with the correct partition, account, and resource specifications. 3. Execute `./launch.sh` > :warning: Snakemake provides a `--cluster` flag, but this functionality should be avoided as it's really not suited for HPC systems. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Covid19 Pilot workflow (Snakemake)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Active sites dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CPAT header" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Control IDs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genome annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam-A HMM Stockholm file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pfam-A HMM library" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "RNA-seq data collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-06-11T19:17:35Z"^^ns1:Date ; ns1:dateModified "2023-06-11T19:17:56Z"^^ns1:Date ; ns1:description "Genome-wide alternative splicing analysis v.2" ; ns1:image ; ns1:input , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Transcriptomics, Alternative splicing, isoform switching" ; ns1:license ; ns1:name "Genome-wide alternative splicing analysis v.2" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genome annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-05-25T23:15:06Z"^^ns1:Date ; ns1:dateModified "2023-05-25T23:15:06Z"^^ns1:Date ; ns1:description "Abstract CWL Automatically generated from the Galaxy workflow file: Copy of Genome-wide alternative splicing analysis" ; ns1:image ; ns1:input , ; ns1:keywords "" ; ns1:license ; ns1:name "StringTie workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing)>, ; ns1:dateCreated "2023-05-30T07:23:13Z"^^ns1:Date ; ns1:dateModified "2023-10-28T07:59:47Z"^^ns1:Date ; ns1:description """**Name:** Matrix Multiplication **Contact Person:** support-compss@bsc.es **Access Level:** public **License Agreement:** Apache2 **Platform:** COMPSs # Description Matrix multiplication is a binary operation that takes a pair of matrices and produces another matrix. If A is an n×m matrix and B is an m×p matrix, the result AB of their multiplication is an n×p matrix defined only if the number of columns m in A is equal to the number of rows m in B. When multiplying A and B, the elements of the rows in A are multiplied with corresponding columns in B. In this implementation, A and B are square matrices (same number of rows and columns), and so it is the result matrix C. Each matrix is divided in N blocks of M doubles. The multiplication of two blocks is done by a multiply task method with a simple three-nested-loop implementation. When executed with COMPSs, the main program generates N^3^ tasks arranged as N^2^ chains of N tasks in the dependency graph. # Versions There are three versions of Matrix Multiplication, depending on the data types used to store the blocks. ## Version 1 ''files'', where the matrix blocks are stored in files. ## Version 2 ''objects'', where the matrix blocks are represented by objects. ## Version 3 ''arrays'', where the matrix blocks are stored in arrays. 
# Execution instructions Usage: ``` runcompss matmul.files.Matmul numberOfBlocks blockSize runcompss matmul.objects.Matmul numberOfBlocks blockSize runcompss matmul.arrays.Matmul numberOfBlocks blockSize ``` where: * numberOfBlocks: Number of blocks inside each matrix * blockSize: Size of each block # Execution Example ``` runcompss matmul.objects.Matmul 16 4 runcompss matmul.files.Matmul 16 4 runcompss matmul.arrays.Matmul 16 4 ``` # Build ## Option 1: Native java ``` cd ~/tutorial_apps/java/matmul/; javac src/main/java/matmul/*/*.java cd src/main/java/; jar cf matmul.jar matmul/ cd ../../../; mv src/main/java/matmul.jar jar/ ``` ## Option 2: Maven ``` cd ~/tutorial_apps/java/matmul/ mvn clean package ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.484.1" ; ns1:image ; ns1:keywords "Java, COMPSs, Tutorial, Example, Laptop, data_persistence" ; ns1:license ; ns1:name "Java COMPSs Matrix Multiplication, out-of-core, using files" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-05-30T08:55:07Z"^^ns1:Date ; ns1:dateModified "2023-10-27T12:19:50Z"^^ns1:Date ; ns1:description """**Name:** Matrix multiplication with Files **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Matrix multiplication is a binary operation that takes a pair of matrices and produces another matrix. If A is an n×m matrix and B is an m×p matrix, the result AB of their multiplication is an n×p matrix defined only if the number of columns m in A is equal to the number of rows m in B. 
When multiplying A and B, the elements of the rows in A are multiplied with corresponding columns in B. In this implementation, A and B are square matrices (same number of rows and columns), and so it is the result matrix C. Each matrix is divided in N blocks of M doubles. The multiplication of two blocks is done by a multiply task method with a simple three-nested-loop implementation. When executed with COMPSs, the main program generates N^3^ tasks arranged as N^2^ chains of N tasks in the dependency graph. # Execution instructions Usage: ``` runcompss --lang=python src/matmul_files.py numberOfBlocks blockSize ``` where: * numberOfBlocks: Number of blocks inside each matrix * blockSize: Size of each block # Execution Examples ``` runcompss --lang=python src/matmul_files.py 4 4 runcompss src/matmul_files.py 4 4 python -m pycompss src/matmul_files.py 4 4 ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.485.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Tutorial, Example, Laptop, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Matrix Multiplication, out-of-core, using files" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T10:18:11Z"^^ns1:Date ; ns1:dateModified "2026-03-25T10:37:39Z"^^ns1:Date ; ns1:description """# Protein Conformational ensembles generation ## Workflow included in the [ELIXIR 3D-Bioinfo](https://elixir-europe.org/communities/3d-bioinfo) Implementation Study: ### Building on PDBe-KB to chart and characterize the conformation landscape of native proteins This tutorial aims to illustrate the process of generating **protein conformational ensembles** from** 3D structures **and analysing its **molecular flexibility**, step by step, using the **BioExcel Building Blocks library (biobb)**. ## Conformational landscape of native proteins **Proteins** are **dynamic** systems that adopt multiple **conformational states**, a property essential for many **biological processes** (e.g. binding other proteins, nucleic acids, small molecule ligands, or switching between functionaly active and inactive states). Characterizing the different **conformational states** of proteins and the transitions between them is therefore critical for gaining insight into their **biological function** and can help explain the effects of genetic variants in **health** and **disease** and the action of drugs. **Structural biology** has become increasingly efficient in sampling the different **conformational states** of proteins. The **PDB** has currently archived more than **170,000 individual structures**, but over two thirds of these structures represent **multiple conformations** of the same or related protein, observed in different crystal forms, when interacting with other proteins or other macromolecules, or upon binding small molecule ligands. Charting this conformational diversity across the PDB can therefore be employed to build a useful approximation of the **conformational landscape** of native proteins. 
A number of resources and **tools** describing and characterizing various often complementary aspects of protein **conformational diversity** in known structures have been developed, notably by groups in Europe. These tools include algorithms with varying degree of sophistication, for aligning the 3D structures of individual protein chains or domains, of protein assemblies, and evaluating their degree of **structural similarity**. Using such tools one can **align structures pairwise**, compute the corresponding **similarity matrix**, and identify ensembles of **structures/conformations** with a defined **similarity level** that tend to recur in different PDB entries, an operation typically performed using **clustering** methods. Such workflows are at the basis of resources such as **CATH, Contemplate, or PDBflex** that offer access to **conformational ensembles** comprised of similar **conformations** clustered according to various criteria. Other types of tools focus on differences between **protein conformations**, identifying regions of proteins that undergo large **collective displacements** in different PDB entries, those that act as **hinges or linkers**, or regions that are inherently **flexible**. To build a meaningful approximation of the **conformational landscape** of native proteins, the **conformational ensembles** (and the differences between them), identified on the basis of **structural similarity/dissimilarity** measures alone, need to be **biophysically characterized**. This may be approached at **two different levels**. - At the **biological level**, it is important to link observed **conformational ensembles**, to their **functional roles** by evaluating the correspondence with **protein family classifications** based on sequence information and **functional annotations** in public databases e.g. Uniprot, PDBe-Knowledge Base (KB). 
These links should provide valuable mechanistic insights into how the **conformational and dynamic properties** of proteins are exploited by evolution to regulate their **biological function**.

- At the **physical level** one needs to introduce **energetic consideration** to evaluate the likelihood that the identified **conformational ensembles** represent **conformational states** that the protein (or domain under study) samples in isolation. Such evaluation is notoriously **challenging** and can only be roughly approximated by using **computational methods** to evaluate the extent to which the observed **conformational ensembles** can be reproduced by algorithms that simulate the **dynamic behavior** of protein systems. These algorithms include the computationally expensive **classical molecular dynamics (MD) simulations** to sample local thermal fluctuations but also faster more approximate methods such as **Elastic Network Models** and **Normal Mode Analysis** (NMA) to model low energy **collective motions**. Alternatively, **enhanced sampling molecular dynamics** can be used to model complex types of **conformational changes** but at a very high computational cost. The **ELIXIR 3D-Bioinfo Implementation Study** *Building on PDBe-KB to chart and characterize the conformation landscape of native proteins* focuses on: 1. Mapping the **conformational diversity** of proteins and their homologs across the PDB. 2. Characterize the different **flexibility properties** of protein regions, and link this information to sequence and functional annotation. 3. Benchmark **computational methods** that can predict a biophysical description of protein motions. This notebook is part of the third objective, where a list of **computational resources** that are able to predict **protein flexibility** and **conformational ensembles** have been collected, evaluated, and integrated in reproducible and interoperable workflows using the **BioExcel Building Blocks library**. Note that the list is not meant to be exhaustive, it is built following the expertise of the implementation study partners. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.486.5" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein conformational ensembles generation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T10:31:40Z"^^ns1:Date ; ns1:dateModified "2026-03-25T10:33:53Z"^^ns1:Date ; ns1:description """# Protein Conformational ensembles generation ## Workflow included in the [ELIXIR 3D-Bioinfo](https://elixir-europe.org/communities/3d-bioinfo) Implementation Study: ### Building on PDBe-KB to chart and characterize the conformation landscape of native proteins This tutorial aims to illustrate the process of generating **protein conformational ensembles** from** 3D structures **and analysing its **molecular flexibility**, step by step, using the **BioExcel Building Blocks library (biobb)**. 
## Conformational landscape of native proteins **Proteins** are **dynamic** systems that adopt multiple **conformational states**, a property essential for many **biological processes** (e.g. binding other proteins, nucleic acids, small molecule ligands, or switching between functionally active and inactive states). Characterizing the different **conformational states** of proteins and the transitions between them is therefore critical for gaining insight into their **biological function** and can help explain the effects of genetic variants in **health** and **disease** and the action of drugs. **Structural biology** has become increasingly efficient in sampling the different **conformational states** of proteins. The **PDB** has currently archived more than **170,000 individual structures**, but over two thirds of these structures represent **multiple conformations** of the same or related protein, observed in different crystal forms, when interacting with other proteins or other macromolecules, or upon binding small molecule ligands. Charting this conformational diversity across the PDB can therefore be employed to build a useful approximation of the **conformational landscape** of native proteins. A number of resources and **tools** describing and characterizing various often complementary aspects of protein **conformational diversity** in known structures have been developed, notably by groups in Europe. These tools include algorithms with varying degree of sophistication, for aligning the 3D structures of individual protein chains or domains, of protein assemblies, and evaluating their degree of **structural similarity**. Using such tools one can **align structures pairwise**, compute the corresponding **similarity matrix**, and identify ensembles of **structures/conformations** with a defined **similarity level** that tend to recur in different PDB entries, an operation typically performed using **clustering** methods. 
Such workflows are at the basis of resources such as **CATH, Contemplate, or PDBflex** that offer access to **conformational ensembles** comprised of similar **conformations** clustered according to various criteria. Other types of tools focus on differences between **protein conformations**, identifying regions of proteins that undergo large **collective displacements** in different PDB entries, those that act as **hinges or linkers**, or regions that are inherently **flexible**. To build a meaningful approximation of the **conformational landscape** of native proteins, the **conformational ensembles** (and the differences between them), identified on the basis of **structural similarity/dissimilarity** measures alone, need to be **biophysically characterized**. This may be approached at **two different levels**. - At the **biological level**, it is important to link observed **conformational ensembles** to their **functional roles** by evaluating the correspondence with **protein family classifications** based on sequence information and **functional annotations** in public databases e.g. Uniprot, PDBe-Knowledge Base (KB). These links should provide valuable mechanistic insights into how the **conformational and dynamic properties** of proteins are exploited by evolution to regulate their **biological function**.

- At the **physical level** one needs to introduce **energetic considerations** to evaluate the likelihood that the identified **conformational ensembles** represent **conformational states** that the protein (or domain under study) samples in isolation. Such evaluation is notoriously **challenging** and can only be roughly approximated by using **computational methods** to evaluate the extent to which the observed **conformational ensembles** can be reproduced by algorithms that simulate the **dynamic behavior** of protein systems. These algorithms include the computationally expensive **classical molecular dynamics (MD) simulations** to sample local thermal fluctuations but also faster more approximate methods such as **Elastic Network Models** and **Normal Mode Analysis** (NMA) to model low energy **collective motions**. Alternatively, **enhanced sampling molecular dynamics** can be used to model complex types of **conformational changes** but at a very high computational cost. The **ELIXIR 3D-Bioinfo Implementation Study** *Building on PDBe-KB to chart and characterize the conformation landscape of native proteins* focuses on: 1. Mapping the **conformational diversity** of proteins and their homologs across the PDB. 2. Characterize the different **flexibility properties** of protein regions, and link this information to sequence and functional annotation. 3. Benchmark **computational methods** that can predict a biophysical description of protein motions. This notebook is part of the third objective, where a list of **computational resources** that are able to predict **protein flexibility** and **conformational ensembles** has been collected, evaluated, and integrated in reproducible and interoperable workflows using the **BioExcel Building Blocks library**. Note that the list is not meant to be exhaustive; it is built following the expertise of the implementation study partners. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.487.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein conformational ensembles generation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_ndx_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pcz_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pcz_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_gro_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_rmsd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_bfactor_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T10:20:45Z"^^ns1:Date ; ns1:dateModified "2026-03-25T10:31:13Z"^^ns1:Date ; ns1:description """# Protein Conformational ensembles generation ## Workflow included in the [ELIXIR 3D-Bioinfo](https://elixir-europe.org/communities/3d-bioinfo) Implementation Study: ### Building on PDBe-KB to chart and characterize the conformation landscape of native proteins This tutorial aims to illustrate the process of generating **protein conformational ensembles** from** 3D structures **and analysing its **molecular flexibility**, step by step, using the **BioExcel Building Blocks library (biobb)**. ## Conformational landscape of native proteins **Proteins** are **dynamic** systems that adopt multiple **conformational states**, a property essential for many **biological processes** (e.g. binding other proteins, nucleic acids, small molecule ligands, or switching between functionaly active and inactive states). Characterizing the different **conformational states** of proteins and the transitions between them is therefore critical for gaining insight into their **biological function** and can help explain the effects of genetic variants in **health** and **disease** and the action of drugs. **Structural biology** has become increasingly efficient in sampling the different **conformational states** of proteins. The **PDB** has currently archived more than **170,000 individual structures**, but over two thirds of these structures represent **multiple conformations** of the same or related protein, observed in different crystal forms, when interacting with other proteins or other macromolecules, or upon binding small molecule ligands. Charting this conformational diversity across the PDB can therefore be employed to build a useful approximation of the **conformational landscape** of native proteins. 
A number of resources and **tools** describing and characterizing various often complementary aspects of protein **conformational diversity** in known structures have been developed, notably by groups in Europe. These tools include algorithms with varying degrees of sophistication, for aligning the 3D structures of individual protein chains or domains, of protein assemblies, and evaluating their degree of **structural similarity**. Using such tools one can **align structures pairwise**, compute the corresponding **similarity matrix**, and identify ensembles of **structures/conformations** with a defined **similarity level** that tend to recur in different PDB entries, an operation typically performed using **clustering** methods. Such workflows are at the basis of resources such as **CATH, Contemplate, or PDBflex** that offer access to **conformational ensembles** comprised of similar **conformations** clustered according to various criteria. Other types of tools focus on differences between **protein conformations**, identifying regions of proteins that undergo large **collective displacements** in different PDB entries, those that act as **hinges or linkers**, or regions that are inherently **flexible**. To build a meaningful approximation of the **conformational landscape** of native proteins, the **conformational ensembles** (and the differences between them), identified on the basis of **structural similarity/dissimilarity** measures alone, need to be **biophysically characterized**. This may be approached at **two different levels**. - At the **biological level**, it is important to link observed **conformational ensembles** to their **functional roles** by evaluating the correspondence with **protein family classifications** based on sequence information and **functional annotations** in public databases e.g. Uniprot, PDBe-Knowledge Base (KB). 
These links should provide valuable mechanistic insights into how the **conformational and dynamic properties** of proteins are exploited by evolution to regulate their **biological function**.

- At the **physical level** one needs to introduce **energetic considerations** to evaluate the likelihood that the identified **conformational ensembles** represent **conformational states** that the protein (or domain under study) samples in isolation. Such evaluation is notoriously **challenging** and can only be roughly approximated by using **computational methods** to evaluate the extent to which the observed **conformational ensembles** can be reproduced by algorithms that simulate the **dynamic behavior** of protein systems. These algorithms include the computationally expensive **classical molecular dynamics (MD) simulations** to sample local thermal fluctuations but also faster more approximate methods such as **Elastic Network Models** and **Normal Mode Analysis** (NMA) to model low energy **collective motions**. Alternatively, **enhanced sampling molecular dynamics** can be used to model complex types of **conformational changes** but at a very high computational cost. The **ELIXIR 3D-Bioinfo Implementation Study** *Building on PDBe-KB to chart and characterize the conformation landscape of native proteins* focuses on: 1. Mapping the **conformational diversity** of proteins and their homologs across the PDB. 2. Characterize the different **flexibility properties** of protein regions, and link this information to sequence and functional annotation. 3. Benchmark **computational methods** that can predict a biophysical description of protein motions. This notebook is part of the third objective, where a list of **computational resources** that are able to predict **protein flexibility** and **conformational ensembles** has been collected, evaluated, and integrated in reproducible and interoperable workflows using the **BioExcel Building Blocks library**. Note that the list is not meant to be exhaustive; it is built following the expertise of the implementation study partners. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.488.3" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein conformational ensembles generation" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T09:47:01Z"^^ns1:Date ; ns1:dateModified "2023-06-01T10:57:27Z"^^ns1:Date ; ns1:description """IDR is based on OMERO and thus all what we show in this notebook can be easily adjusted for use against another OMERO server, e.g. your institutional OMERO server instance. The main objective of this notebook is to demonstrate how public resources such as the IDR can be used to train your neural network or validate software tools. 
The authors of the PLOS Biology paper, "Nessys: A new set of tools for the automated detection of nuclei within intact tissues and dense 3D cultures" published in August 2019: https://doi.org/10.1371/journal.pbio.3000388, considered several image segmentation packages, but they did not use the approach described in this notebook. We will analyse the data using Cellpose and compare the output with the original segmentation produced by the authors. StarDist was not considered by the authors. Our workflow shows how a public repository can be accessed and the data inside it used to validate software tools or new algorithms. We will use an image (id=6001247) referenced in the paper. The image can be viewed online in the Image Data Resource (IDR). We will use a predefined model from Cellpose as a starting point. Steps to access data from IDR could be re-used if you wish to create a new model (outside the scope of this notebook). ## Launch This notebook uses the [environment_cellpose.yml](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/environment_cellpose.yml) file. See [Setup](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/setup.md).""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.489.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "imaging, Machine Learning, Python" ; ns1:license ; ns1:name "Validate a tool against IDR data: Load Image with labels from IDR, re-analyze using Cellpose" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "concat_traj.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybd_flexserv_bd_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mybd_flexserv_bd_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_disco_bfactor.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_disco_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_disco_traj.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_dist.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_dist.gro" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myconcoord_dist.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_disco_traj.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_bd_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_bd_traj_fitted.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_dmd_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_dmd_traj_fitted.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_nma_ensemble.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_flexserv_nma_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_imods_ensemble.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_mask_backbone" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_mask_ca.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_meta_traj_fitted.crd" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_meta_traj_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_nolb_ensemble.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_nolb_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_pcz_proj1.dcd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_prody_anm_traj.trr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mycpptraj_prody_rms.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mydmd_flexserv_dmd_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mydmd_flexserv_dmd_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_model.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myextract_monomer.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_cluster.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_cluster.xpm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_cluster.xvg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mygmx_concat.cluster.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myimod_imc.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myimod_imode_evecs.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mymake_gmx_ndx.ndx" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mynma_flexserv_nma_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mynma_flexserv_nma_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mynolb_ensemble.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_bfactor_all.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_bfactor_all.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_collectivity.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_evecs.json" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_hinges_bfactor_report.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_hinges_fcte_report.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_proj1.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_report.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypcz_stiffness.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "myprody_anm_traj.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mytrjcat_concat_traj.trr" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T09:50:55Z"^^ns1:Date ; ns1:dateModified "2023-06-01T09:53:56Z"^^ns1:Date ; ns1:description """# Protein Conformational ensembles generation ## Workflow included in the [ELIXIR 3D-Bioinfo](https://elixir-europe.org/communities/3d-bioinfo) Implementation Study: ### Building on PDBe-KB to chart and characterize the conformation landscape of native proteins This tutorial aims to illustrate the process of generating **protein conformational ensembles** from** 3D structures **and analysing its **molecular flexibility**, step by step, using the **BioExcel Building Blocks library (biobb)**. ## Conformational landscape of native proteins **Proteins** are **dynamic** systems that adopt multiple **conformational states**, a property essential for many **biological processes** (e.g. binding other proteins, nucleic acids, small molecule ligands, or switching between functionaly active and inactive states). Characterizing the different **conformational states** of proteins and the transitions between them is therefore critical for gaining insight into their **biological function** and can help explain the effects of genetic variants in **health** and **disease** and the action of drugs. 
**Structural biology** has become increasingly efficient in sampling the different **conformational states** of proteins. The **PDB** has currently archived more than **170,000 individual structures**, but over two thirds of these structures represent **multiple conformations** of the same or related protein, observed in different crystal forms, when interacting with other proteins or other macromolecules, or upon binding small molecule ligands. Charting this conformational diversity across the PDB can therefore be employed to build a useful approximation of the **conformational landscape** of native proteins. A number of resources and **tools** describing and characterizing various often complementary aspects of protein **conformational diversity** in known structures have been developed, notably by groups in Europe. These tools include algorithms with varying degree of sophistication, for aligning the 3D structures of individual protein chains or domains, of protein assemblies, and evaluating their degree of **structural similarity**. Using such tools one can **align structures pairwise**, compute the corresponding **similarity matrix**, and identify ensembles of **structures/conformations** with a defined **similarity level** that tend to recur in different PDB entries, an operation typically performed using **clustering** methods. Such workflows are at the basis of resources such as **CATH, Contemplate, or PDBflex** that offer access to **conformational ensembles** comprised of similar **conformations** clustered according to various criteria. Other types of tools focus on differences between **protein conformations**, identifying regions of proteins that undergo large **collective displacements** in different PDB entries, those that act as **hinges or linkers**, or regions that are inherently **flexible**. 
To build a meaningful approximation of the **conformational landscape** of native proteins, the **conformational ensembles** (and the differences between them), identified on the basis of **structural similarity/dissimilarity** measures alone, need to be **biophysically characterized**. This may be approached at **two different levels**. - At the **biological level**, it is important to link observed **conformational ensembles** to their **functional roles** by evaluating the correspondence with **protein family classifications** based on sequence information and **functional annotations** in public databases e.g. Uniprot, PDBe-Knowledge Base (KB). These links should provide valuable mechanistic insights into how the **conformational and dynamic properties** of proteins are exploited by evolution to regulate their **biological function**.

- At the **physical level** one needs to introduce **energetic considerations** to evaluate the likelihood that the identified **conformational ensembles** represent **conformational states** that the protein (or domain under study) samples in isolation. Such evaluation is notoriously **challenging** and can only be roughly approximated by using **computational methods** to evaluate the extent to which the observed **conformational ensembles** can be reproduced by algorithms that simulate the **dynamic behavior** of protein systems. These algorithms include the computationally expensive **classical molecular dynamics (MD) simulations** to sample local thermal fluctuations but also faster more approximate methods such as **Elastic Network Models** and **Normal Mode Analysis** (NMA) to model low energy **collective motions**. Alternatively, **enhanced sampling molecular dynamics** can be used to model complex types of **conformational changes** but at a very high computational cost. The **ELIXIR 3D-Bioinfo Implementation Study** *Building on PDBe-KB to chart and characterize the conformation landscape of native proteins* focuses on: 1. Mapping the **conformational diversity** of proteins and their homologs across the PDB. 2. Characterize the different **flexibility properties** of protein regions, and link this information to sequence and functional annotation. 3. Benchmark **computational methods** that can predict a biophysical description of protein motions. This notebook is part of the third objective, where a list of **computational resources** that are able to predict **protein flexibility** and **conformational ensembles** have been collected, evaluated, and integrated in reproducible and interoperable workflows using the **BioExcel Building Blocks library**. Note that the list is not meant to be exhaustive, it is built following the expertise of the implementation study partners. 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.490.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Protein conformational ensembles generation" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T10:05:54Z"^^ns1:Date ; ns1:dateModified "2023-06-01T10:07:09Z"^^ns1:Date ; ns1:description """The notebook shows how to load an IDR image with labels. The image is referenced in the paper "NesSys: a novel method for accurate nuclear segmentation in 3D" published August 2019 in PLOS Biology: https://doi.org/10.1371/journal.pbio.3000388 and can be viewed online in the Image Data Resource. In this notebook, the image is loaded together with the labels and analyzed using StarDist. The StarDist analysis produces a segmentation, which is then viewed side-by-side with the original segmentations produced by the authors of the paper obtained via the loaded labels. 
## Launch This notebook uses the [environment_stardist.yml](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/environment_stardist.yml) file. See [Setup](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/setup.md).""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.493.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Validate a tool against IDR data: Load Image with labels from IDR, re-analyze using StarDist" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T10:14:31Z"^^ns1:Date ; ns1:dateModified "2023-06-01T10:53:01Z"^^ns1:Date ; ns1:description """## Learning Objectives - How to access genomic resource via its Python API - How to access image resource via its Python API - Relate image data to genomic data ## Diabetes related genes expressed in pancreas This notebook looks at the question **Which diabetes related genes are expressed in the pancreas?** Tissue and disease can be modified. Steps: - Query [humanmine.org](https://www.humanmine.org/humanmine), an integrated database of Homo sapiens genomic data using the intermine API to find the genes. - Using the list of found genes, search in the [Image Data Resource (IDR)](https://idr.openmicroscopy.org/) for images linked to the genes, tissue and disease. - Analyse the images found. ## Launch This notebook uses the [environment.yml](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/environment.yml) file. See [Setup](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_4/setup.md). 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.494.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "imaging, Python" ; ns1:license ; ns1:name "Use Public Resources to answer a biological question" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T10:30:51Z"^^ns1:Date ; ns1:dateModified "2023-07-04T00:40:25Z"^^ns1:Date ; ns1:description """## Learning objectives - Read data to analyse from an object store. - Analyse data in parallel using Dask. - Show how to use public resources to train neural network. - Load labels associated to the original data - Compare results with ground truth. The authors of the PLOS Biology paper, "Nessys: A new set of tools for the automated detection of nuclei within intact tissues and dense 3D cultures" published in August 2019: https://doi.org/10.1371/journal.pbio.3000388, considered several image segmenation packages, but they did not use the approach described in this notebook. We will analyse the data using [Cellpose](https://www.cellpose.org/) and compare the output with the original segmentation produced by the authors. Cellpose was not considered by the authors. Our workflow shows how public repository can be accessed and data inside it used to validate software tools or new algorithms. We will use a predefined model from Cellpose as a starting point. ## Launch This notebook uses the [environment.yml](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_5/environment.yml) file. 
See [Setup](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_5/setup.md).""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.495.1" ; ns1:image ; ns1:keywords "OME-Zarr, Python, imaging, image processing, Machine Learning, S3" ; ns1:license ; ns1:name "Load ome.zarr Image with labels from public S3 repositories, analyze in parallel using Cellpose and compare results" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-01T10:49:40Z"^^ns1:Date ; ns1:dateModified "2023-06-01T10:50:42Z"^^ns1:Date ; ns1:description """The image is referenced in the paper "NesSys: a novel method for accurate nuclear segmentation in 3D" published August 2019 in PLOS Biology: https://doi.org/10.1371/journal.pbio.3000388 and can be viewed online in the [Image Data Resource](https://idr.openmicroscopy.org/webclient/?show=image-6001247). This original image was converted into the Zarr format. The analysis results produced by the authors of the paper were converted into labels and linked to the Zarr file which was placed into a public S3 repository. In this notebook, the Zarr file is then loaded together with the labels from the S3 storage and analyzed using [StarDist](https://github.com/stardist/stardist). The StarDist analysis produces a segmentation, which is then viewed side-by-side with the original segmentations produced by the authors of the paper obtained via the loaded labels. ## Launch This notebook uses the [environment_stardist.yml](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_5/environment_stardist.yml) file. 
See [Setup](https://github.com/ome/EMBL-EBI-imaging-course-05-2023/blob/main/Day_5/setup.md).""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.496.1" ; ns1:image ; ns1:keywords "OME-Zarr, Python, Machine Learning, imaging, S3" ; ns1:license ; ns1:name "Load ome.zarr Image with labels from a public S3 repository, analyze using StarDist and compare results" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2020-07-24T13:00:50Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:46:05Z"^^ns1:Date ; ns1:description "Galaxy-E (ecology.usegalaxy.eu) workflow to calculate species presence / absence, community metrics and compute generalized linear models to identify effects and significativity of these effects on biodiversity." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.49.2" ; ns1:image ; ns1:input , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "Community_metrics, Presence_absence, GLM, Ecology, Biodiversity, Species abundance, Modeling, Statistics" ; ns1:license ; ns1:name "Population and community metrics calculation from Biodiversity data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T12:06:00Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:39:49Z"^^ns1:Date ; ns1:description "Preprocessing of raw SARS-CoV-2 reads. This workflow contains an alternate starting point to avoid the data to be downloaded from the NCBI SRA. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - Read pre-processing without downloading from SRA" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1_Input Dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2_Input Dataset" . a ns1:Person ; ns1:name "Milad Miladi" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "draft.fa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fast5_files.tar.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reads.fasta" . a ns1:ComputerLanguage ; ns1:alternateName "RMD" ; ns1:name "R markdown" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-07T17:32:16Z"^^ns1:Date ; ns1:dateModified "2023-06-07T17:52:17Z"^^ns1:Date ; ns1:description """# Introduction This repository contains all the custom scripts used in the evaluation and comparison of [Katdetectr](https://github.com/ErasmusMC-CCBC/evaluation_katdetectr/tree/main) as described in the corresponding Technical Note (under submission). 
# Usage All required files were deposited on [Zenodo](https://zenodo.org/record/6623289#.YqBxHi8Rr0o%5D). These can directly be downloaded using `zen4R` and be used as input. ```R # Increase the timeout (due to some large files). options(timeout=5000) # Download the required files into the data/ folder (~1GB). zen4R::download_zenodo(doi = "10.5281/zenodo.6810477", path = 'data/') ```""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.500.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Performance evaluation of Katdetectr and other kataegis detection packages" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "qmd" ; ns1:name "Quarto Markdown" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , ; ns1:dateCreated "2024-07-26T13:08:32Z"^^ns1:Date ; ns1:dateModified "2024-07-26T16:03:14Z"^^ns1:Date ; ns1:description "This publication corresponds to the Research Objects (RO) of the Baseline Use Case proposed in T.5.2 (WP5) in the BY-COVID project on “COVID-19 Vaccine(s) effectiveness in preventing SARS-CoV-2 infection”." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.502.4" ; ns1:isBasedOn ; ns1:keywords "covid-19, vaccines, comparative effectiveness, causal inference, international comparison, SARS-CoV-2, common data model, directed acyclic graph, synthetic data" ; ns1:license ; ns1:name "BY-COVID WP5 T5.2 Baseline Use Case" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-03-16T07:45:11Z"^^ns1:Date ; ns1:dateModified "2024-03-20T16:10:05Z"^^ns1:Date ; ns1:description """## Purge dups This snakemake pipeline is designed to be run using as input a contig-level genome and pacbio reads. This pipeline has been tested with `snakemake v7.32.4`. Raw long-read sequencing files and the input contig genome assembly must be given in the `config.yaml` file. To execute the workflow run: `snakemake --use-conda --cores N` Or configure the cluster.json and run using the `./run_cluster` command""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.506.2" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "Assembly, Genomics, Snakemake, Bioinformatics, Genome assembly, Biodiversity" ; ns1:license ; ns1:name "Purge retained haplotypes using Purge-Dups" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-08-05T12:52:50Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Basic workflows inspired by the Nanopolish tutorials" ; ns1:input , , ; ns1:keywords "" ; ns1:license ; ns1:name "ONT --Tutorial-Nanopolish-variants" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Milad Miladi" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "PkorrP19E3_ONT_fast5.tar.gz" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "collision_info" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "db_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "db_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gnps_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hmdb_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mbank_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mzml_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ppmx" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "python_script" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "r_script" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "candidate_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "result" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , ; ns1:dateCreated "2023-08-01T14:21:50Z"^^ns1:Date ; ns1:dateModified "2023-08-01T14:22:09Z"^^ns1:Date ; ns1:description """ This repository hosts Metabolome Annotation Workflow (MAW). The workflow takes MS2 .mzML format data files as an input in R. It performs spectral database dereplication using R Package Spectra and compound database dereplication using SIRIUS OR MetFrag . Final candidate selection is done in Python using RDKit and PubChemPy.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.510.2" ; ns1:image ; ns1:input , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Metabolomics, Annotation, mass-spectrometry, identification, Bioinformatics, FAIR workflows, workflow, gnps, massbank, hmdb, spectra, rdkit, Cheminformatics" ; ns1:license ; ns1:name "Metabolome Annotation Workflow (MAW)" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Ana Ayala" . a ns1:Person ; ns1:name "Dennis Trolle" . a ns1:Person ; ns1:name "Don Pierson" . a ns1:Person ; ns1:name "Inmaculada Jiménez-Navarro" . a ns1:Person ; ns1:name "Javier Senent-Aparicio" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , ; ns1:dateCreated "2024-01-16T13:18:14Z"^^ns1:Date ; ns1:dateModified "2024-01-16T13:19:15Z"^^ns1:Date ; ns1:description "Simulations and figures supporting the manuscript \"Timing of spring events changes under modelled future climate scenarios in a mesotrophic lake\"" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.511.5" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Timing of spring events changes under modelled future climate scenarios in a mesotrophic lake" ; ns1:producer ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "barcodes.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "genes.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "matrix.mtx" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "AnnData Loaded" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:44:52Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:44:52Z"^^ns1:Date ; ns1:description "Loads a single cell counts matrix into an annData format - adding a column called sample with the sample name. (Input format - matrix.mtx, features.tsv and barcodes.tsv)" ; ns1:input , , , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq: Load counts matrix" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Barcode Whitelist file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Expected Cells" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Paired Fastqs for one sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference Genome fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reference annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/AnnData Loaded" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): Alignments" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): Barcode/Feature Statistic Summaries" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): Barcodes filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): Genes filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): Matrix Gene Counts filtered" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNA STARSolo on input dataset(s): log" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:47:35Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:47:35Z"^^ns1:Date ; ns1:description "Takes fastqs and reference data, to produce a single cell counts matrix into and save in annData format - adding a column called sample with the sample name. " ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq: Count and Load with starSOLO" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . 
a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:19:29Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:55:19Z"^^ns1:Date ; ns1:description """Take a scRNAseq counts matrix from a single sample, and perform basic QC with scanpy. Then, do further processing by making a UMAP and clustering. Produces a processed AnnData object. Deprecated: use individual workflows instead for multiple samples""" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq Single Sample Processing Counts Matrix" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing)>, ; ns1:dateCreated "2023-06-23T14:03:17Z"^^ns1:Date ; ns1:dateModified "2023-11-24T08:43:23Z"^^ns1:Date ; ns1:description """**Name:** SparseLU **Contact Person:** support-compss@bsc.es **Access Level:** public **License Agreement:** Apache2 **Platform:** COMPSs # Description The Sparse LU application computes an LU matrix factorization on a sparse blocked matrix. The matrix size (number of blocks) and the block size are parameters of the application. As the algorithm progresses, the area of the matrix that is accessed is smaller; concretely, at each iteration, the 0th row and column of the current matrix are discarded. On the other hand, due to the sparseness of the matrix, some of its blocks might not be allocated and, therefore, no work is generated for them. 
When executed with COMPSs, Sparse LU produces several types of task with different granularity and numerous dependencies between them. # Versions There are three versions of Sparse LU, depending on the data types used to store the blocks. ## Version 1 ''files'', where the matrix blocks are stored in files. ## Version 2 ''objects'', where the matrix blocks are represented by objects. ## Version 3 ''arrays'', where the matrix blocks are stored in arrays. # Execution instructions Usage: ``` runcompss sparseLU.files.SparseLU numberOfBlocks blockSize runcompss sparseLU.objects.SparseLU numberOfBlocks blockSize runcompss sparseLU.arrays.SparseLU numberOfBlocks blockSize ``` where: * numberOfBlocks: Number of blocks inside each matrix * blockSize: Size of each block # Execution Example ``` runcompss sparseLU.objects.SparseLU 16 4 runcompss sparseLU.files.SparseLU 16 4 runcompss sparseLU.arrays.SparseLU 16 4 ``` # Build ## Option 1: Native java ``` cd application_sources/; javac src/main/java/sparseLU/*/*.java cd src/main/java/; jar cf sparseLU.jar sparseLU/ cd ../../../; mv src/main/java/sparseLU.jar jar/ ``` ## Option 2: Maven ``` cd application_sources/ mvn clean package ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.515.1" ; ns1:image ; ns1:keywords "Java, COMPSs, Tutorial, Example, Marenostrum IV, Supercomputer, data_persistence" ; ns1:license ; ns1:name "Java COMPSs LU Factorization for Sparse Matrices" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-06-27T07:39:45Z"^^ns1:Date ; ns1:dateModified "2023-06-27T09:25:04Z"^^ns1:Date ; ns1:description """The project allowed us to manage and build structured code scripts on the Jupyter Notebook, a simple web application which is user-friendly, flexible to use in the research community. The script is developed to address the specific needs of research between different platforms of dataset. These stakeholders have developed their own platforms for the annotation and standardisation of both data and metadata produced within their respective field. -The INFRAFRONTIER - European Mutant Mouse Archive (EMMA) comprises over 7200 mutant mouse lines that are extensively integrated and enriched with other public dataset. -The EU-OpenScreen offers compound screening protocols containing several metadata and will contribute to the development of tools for linking to the chemical entity database. -The IDR Image Data Resource is a public repository of reference image datasets from published scientific studies, where the community can submit, search and access high-quality bio-image data. -The CIM-XNAT is an XNAT deployment of the Molecular Imaging Center at UniTo that offers a suite of tools for uploading preclinical images. To address the challenges of integrating several EU-RI datasets with focus on preclinical and discovery research bioimaging, our aim is to develop cross researching queries through a web based interface to combine the resources of the RIs for integrating the information associated with data belonging to the involved RIs. Furthermore, the open-source tool provides users with free, open access to collections of datasets distributed over multiple sources that result from searches by specific keywords. The script allows the cross research in different fields of research as: Species, Strain, Gene, Cell line, Disease model, Chemical Compound. 
The novel aspects of this tool are mainly: a) user friendly, e.g. the user has the flexibility to research among the dataset easily with a simple API, intuitive for researchers and biomedical users. b) the possibility of making a research between different platforms and repositories, from a unique simple way. c) the workflow project follows the FAIR principles in the treatment of data and datasets. The access to Notebook Jupyter needs the installation of Anaconda, which consents to open the web application. Inside the Jupyter, the script was built using Python. The query code is also easy to download and share in a .ipynb file. A visual representation of the detailed results (dataset, metadata, information, query results) of the workflow can be printed immediately after the query run. """ ; ns1:keywords "" ; ns1:license ; ns1:name "Life Science cross-RI (Research Infrastructure) project" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ploidy file (Y for happloid)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-06-27T14:41:53Z"^^ns1:Date ; ns1:dateModified "2025-06-19T10:24:37Z"^^ns1:Date ; ns1:description """SINGLE-END workflow. Align reads on fasta reference/assembly using bwa mem, get a consensus, variants, mutation explanations. IMPORTANT: * For "bcftools call" consensus step, the --ploidy file is in "Données partagées" (Shared Data) and must be imported in your history to use the workflow by providing this file (tells bcftools to consider haploid variant calling). * SELECT the most ADAPTED VADR MODEL for annotation (see vadr parameters).""" ; ns1:image ; ns1:input ; ns1:keywords "single-end, Annotation, variant, Virus, variant_calling, high-throughput_sequencing_analysis, Galaxy, Bioinformatics, SNPs, variant calling" ; ns1:license ; ns1:name "VVV2_align_SE" ; ns1:output , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ploidy file (Y for happloid)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-06-28T09:52:35Z"^^ns1:Date ; ns1:dateModified "2025-06-19T10:23:49Z"^^ns1:Date ; ns1:description """PAIRED-END workflow. Align reads on fasta reference/assembly using bwa mem, get a consensus, variants, mutation explanations. IMPORTANT: * For "bcftools call" consensus step, the --ploidy file is in "Données partagées" (Shared Data) and must be imported in your history to use the worflow by providing this file (tells bcftools to consider haploid variant calling). * SELECT THE MOST ADAPTED VADR MODEL for annotation (see vadr parameters). """ ; ns1:image ; ns1:input ; ns1:keywords "paired-end, variant_calling, Annotation, Virus, Alignment, Bioinformatics, Galaxy, SNPs, covid-19, variant calling, workflow" ; ns1:license ; ns1:name "VVV2_align_PE" ; ns1:output , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Bradley W. Langhorst" . a ns1:Person ; ns1:name "Peter van Heusden" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Max Viz. Coverage Threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum quality score to call base" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired read collection for samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Primer BED" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read fraction to call variant" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference FASTA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamqc_report_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_multifasta" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ivar_consensus_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ivar_variants_tabular" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "primer_trimmed_bam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "snpeff_annotated_vcf" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-06-28T11:42:46Z"^^ns1:Date ; ns1:dateModified "2023-06-30T05:14:33Z"^^ns1:Date ; ns1:description "SARS-CoV-2 variant prediction using Read It And Keep, fastp, bbmap and iVar" ; ns1:input , , , , , ; ns1:keywords "covid-19, ARTIC, SARS-CoV-2, SANBI" ; ns1:license ; ns1:name "SARS-CoV-2 Illumina Amplicon pipeline - SANBI - v1.2" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-08-05T13:01:18Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Genome assembly: Flye-based WF for highly repetitive genomes [Schmid et al. NAR 2018]" ; ns1:input <#ont____assembly_flye_ahrenslab-inputs-ftp://biftp.informatik.uni-freiburg.de/pub/T0/Ahrens/SRR6982805.fastq>, ; ns1:keywords "name:ONT, ONT" ; ns1:license ; ns1:name "ONT -- Assembly-Flye-AhrensLab" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Milad Miladi" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Aligned reads (BAM)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Amplicons" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Consensus Genomes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference Genome" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SARS-CoV-2 Annotation (GFF3)" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-06-28T12:12:55Z"^^ns1:Date ; ns1:dateModified "2023-06-28T12:12:55Z"^^ns1:Date ; ns1:description "" ; ns1:input , , , , ; ns1:keywords "SARS-CoV-2, SANBI" ; ns1:license ; ns1:name "SARS-CoV-2 PostProcessing" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Medaka model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum read length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Primer BED file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sequence Reads" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-06-29T12:07:19Z"^^ns1:Date ; ns1:dateModified "2023-06-29T12:07:19Z"^^ns1:Date ; ns1:description "" ; ns1:input , , , , , ; ns1:keywords "SARS-CoV-2, SANBI, nanopore" ; ns1:license ; ns1:name "SARS-CoV-2 ONT Amplicon Sequencing SANBI 1.0" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_confounders" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_custom_model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_feature" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_measure" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_mod" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_mod0" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_phenotype" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_phenotype_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bg_timecourse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_alpha" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_blind" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_contrast" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_cores" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_denominator" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_design" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_hidden_batch_effects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_hidden_batch_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_hidden_batch_row_means" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_hypothesis" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_lfcThreshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_min_sum_of_reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_numerator" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_pAdjustMethod" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_parallelization" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_phenotype" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_reduced" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_reference_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_transformation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "deseq2_variables" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastx_first_base_to_keep" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fastx_last_base_to_keep" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts_annotation_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts_number_of_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts_output_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts_read_meta_feature_overlap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_alignments_tailored_trans_assemb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_idx_basename" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_idx_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_known_splicesite_infile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_num_of_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_fwd_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_rev" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_qc_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_trimming_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "premapping_input_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw_files_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_compression_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_memory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_sort_by_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_cigar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_collapsecigar" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_fastcompression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_isbam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_iscram" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_randomseed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsingroup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsinlibrary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsquality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithoutbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readtagtostrip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_region" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_samheader" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_uncompressed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_ballgown_table_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_conservative_mode" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_cpus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_expression_estimation_mode" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_guide_gff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_junction_coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_min_isoform_abundance" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_min_read_coverage" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_out_gtf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_transcript_merge_mode" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "stringtie_verbose" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_compression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_do_not_compress" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_strigency" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_trim_suffix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ballgown_de_custom_model" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ballgown_de_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ballgown_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_collect_hisat2_sam_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_deseq2_dds_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_deseq2_de_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_deseq2_res_lfcShrink_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_deseq2_transformed_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastx_trimmer_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastx_trimmer_single" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_featureCounts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_paired_reads_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_single_reads_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_view" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_stringtie_expression_gtf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_stringtie_expression_outdir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_stringtie_merge" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_stringtie_transcript_assembly_gtf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_reports" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-07-05T08:44:44Z"^^ns1:Date ; ns1:dateModified "2023-07-05T09:16:36Z"^^ns1:Date ; ns1:description """A CWL-based pipeline for processing RNA-Seq data (FASTQ format) and performing differential gene/transcript expression analysis. On the respective GitHub folder are available: - The CWL wrappers for the workflow - A pre-configured YAML template, based on validation analysis of publicly available HTS data - A table of metadata (``mrna_cll_subsets_phenotypes.csv``), based on the same validation analysis, to serve as an input example for the design of comparisons during differential expression analysis Briefly, the workflow performs the following steps: 1. Quality control of Illumina reads (FastQC) 2. 
Trimming of the reads (e.g., removal of adapter and/or low quality sequences) (Trim galore) 3. (Optional) custom processing of the reads using FASTA/Q Trimmer (part of the FASTX-toolkit) 4. Mapping to reference genome (HISAT2) 5. Convertion of mapped reads from SAM (Sequence Alignment Map) to BAM (Binary Alignment Map) format (samtools) 6. Sorting mapped reads based on chromosomal coordinates (samtools) Subsequently, two independent workflows are implemented for differential expression analysis at the transcript and gene level. **First**, following the [reference protocol](https://doi.org/10.1038/nprot.2016.095) for HISAT, StringTie and Ballgown transcript expression analysis, StringTie along with a reference transcript annotation GTF (Gene Transfer Format) file (if one is available) is used to: - Assemble transcripts for each RNA-Seq sample using the previous read alignments (BAM files) - Generate a global, non-redundant set of transcripts observed in any of the RNA-Seq samples - Estimate transcript abundances and generate read coverage tables for each RNA-Seq sample, based on the global, merged set of transcripts (rather than the reference) which is observed across all samples Ballgown program is then used to load the coverage tables generated in the previous step and perform statistical analyses for differential expression at the transcript level. Notably, the StringTie - Ballgown protocol applied here was selected to include potentially novel transcripts in the analysis. **Second**, featureCounts is used to count reads that are mapped to selected genomic features, in this case genes by default, and generate a table of read counts per gene and sample. This table is passed as input to DESeq2 to perform differential expression analysis at the gene level. 
Both Ballgown and DESeq2 R scripts, along with their respective CWL wrappers, were designed to receive as input various parameters, such as experimental design, contrasts of interest, numeric thresholds, and hidden batch effects. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.524.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "RNASEQ, Transcriptomics, CWL, workflow" ; ns1:license ; ns1:name "CWL-based RNA-Seq workflow" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ChIPQC_annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ChIPQC_bCount" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ChIPQC_blacklist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ChIPQC_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ChIPQC_facetBy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_bParallel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_background" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_blacklist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_cores" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_design" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_filterFun" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_greylist" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_library" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_low_read_count_filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_minOverlap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_normalization" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_reorderMeta_factor" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_reorderMeta_value" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DiffBind_retrieve_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamCoverage_effective_genome_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamCoverage_extendReads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamCoverage_normalizeUsing" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bamCoverage_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "blackListFile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_downstream" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_outFileSortedRegions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_outputFile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "computeMatrix_upstream" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_idx_basename" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_idx_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "hisat2_num_of_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_control_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_fwd_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_rev" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_qc_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_treatment_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_trimming_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_callpeak_bdg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_callpeak_broad" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_callpeak_format" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_callpeak_gsize" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_callpeak_nomodel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_extsize" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_pvalue" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_qvalue" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "macs2_shift" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metadata_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiBamSummary_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_color" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_method" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_numbers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_outFileName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_plotType" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCorrelation_title" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCoverage_outFileName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCoverage_plotFileFormat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotCoverage_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotFingerprint_outFileName" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotFingerprint_plotFileFormat" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotFingerprint_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotHeatmap_outputFile" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "plotHeatmap_plotFileFormat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw_files_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rose_genome_build" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rose_stitch_distance" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "rose_tss_distance" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_output_format" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_markdup_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_readswithoutbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_compression_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_memory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_illuminaClip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_leading" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_minlen" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_slidingWindow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_pe_trailing" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_illuminaClip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_leading" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_minlen" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_slidingWindow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "trimmomatic_se_trailing" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_macs_ChIPQCexperiment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_macs_ChIPQCreport" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_macs_outdir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_rose_ChIPQCexperiment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_rose_ChIPQCreport" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_ChIPQC_rose_outdir" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_macs_correlation_heatmap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_macs_diffbind_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_macs_diffbind_dba_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_macs_diffbind_normalized_counts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_macs_diffbind_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_rose_correlation_heatmap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_rose_diffbind_consensus" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_rose_diffbind_dba_object" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_rose_diffbind_normalized_counts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_DiffBind_rose_diffbind_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_append_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bamCoverage_norm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bed_to_rose_gff_conversion" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bedtools_coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bedtools_intersect" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bedtools_merge" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_computeMatrix_matrix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_computeMatrix_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_enhancer_bed_processing" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_exclude_black_list_regions" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_exclude_black_list_regions_narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_html_fwd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_html_rev" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_paired_reads_sam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_paired_reads_stderr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_single_reads_sam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_hisat2_for_single_reads_stderr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_bed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_broadPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_cutoff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_gappedPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_lambda" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_model_r" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_narrowPeak" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_pileup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_macs2_call_peaks_xls" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_multiBamSummary_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_paste_content_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_paste_content_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_plotCorrelation_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_plotCoverage_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_plotFingerprint_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_plotHeatmap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_printf_header_samples" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_AllEnhancers_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_Enhancers_withSuper" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_Plot_points" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_STITCHED_ENHANCER_REGION_MAP" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_SuperEnhancers_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_gff_dir_outputs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_rose_main_mappedGFF_dir_outputs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_fixmate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_markdup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort_by_name" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_sort_peaks_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_total_peaks_table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_paired_end_fwd_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_paired_end_fwd_unpaired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_paired_end_rev_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_paired_end_rev_unpaired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_paired_end_stderr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_single_end_fastq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trimmomatic_single_end_stderr" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-07-05T09:39:05Z"^^ns1:Date ; ns1:dateModified "2023-07-05T09:39:32Z"^^ns1:Date ; ns1:description """A CWL-based pipeline for processing ChIP-Seq data (FASTQ format) and performing: - Peak calling - Consensus peak count table generation - Detection of super-enhancer regions - Differential binding analysis On the respective GitHub folder are available: - The CWL wrappers for the workflow - A pre-configured YAML template, based on validation analysis of publicly available HTS data - Tables of metadata (``EZH2_metadata_CLL.csv`` and ``H3K27me3_metadata_CLL.csv``), based on the same validation analysis, to serve as input examples for the design of comparisons during differential binding analysis - A list of ChIP-Seq blacklisted regions (human genome version 38; hg38) from the ENCODE project, which is can be used as input for the workflow, is provided in BED format (``hg38-blacklist.v2.bed``) Briefly, the workflow performs the following steps: 1. Quality control of short reads (FastQC) 2. 
Trimming of the reads (e.g., removal of adapter and/or low quality sequences) (Trimmomatic) 3. Mapping to reference genome (HISAT2) 5. Convertion of mapped reads from SAM (Sequence Alignment Map) to BAM (Binary Alignment Map) format (samtools) 6. Sorting mapped reads based on chromosomal coordinates (samtools) 7. Adding information regarding paired end reads (e.g., CIGAR field information) (samtools) 8. Re-sorting based on chromosomal coordinates (samtools) 9. Removal of duplicate reads (samtools) 10. Index creation for coordinate-sorted BAM files to enable fast random access (samtools) 11. Production of quality metrics and files for the inspection of the mapped ChIP-Seq reads, taking into consideration the experimental design (deeptools2): - Read coverages for genomic regions of two or more BAM files are computed (multiBamSummary). The results are produced in compressed numpy array (NPZ) format and are used to calculate and visualize pairwise correlation values between the read coverages (plotCorrelation). - Estimation of sequencing depth, through genomic position (base pair) sampling, and visualization is performed for multiple BAM files (plotCoverage). - Cumulative read coverages for each indexed BAM file are plotted by counting and sorting all reads overlapping a “window” of specified length (plotFingerprint). - Production of coverage track files (bigWig), with the coverage calculated as the number of reads per consecutive windows of predefined size (bamCoverage), and normalized through various available methods (e.g., Reads Per Kilobase per Million mapped reads; RPKM). The coverage track files are used to calculate scores per selected genomic regions (computeMatrix), typically genes, and a heatmap, based on the scores associated with these genomic regions, is produced (plotHeatmap). 12. Calling potential binding positions (peaks) to the genome (peak calling) (MACS2) 13. 
Generation of consensus peak count table for the application of custom analyses on MACS2 peak calling results (bedtools) 14. Detection of super-enhancer regions (Rank Ordering of Super-Enhancers; ROSE) 15. Differential binding analyses (DiffBind) for: - MACS2 peak calling results - ROSE-detected super-enhancer regions """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.525.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "CWL, workflow, ChIP-seq, Epigenomics" ; ns1:license ; ns1:name "CWL-based ChIP-Seq workflow" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ApplyVQSR_ts_filter_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantRecalibrator_trust_all_polymorphic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantRecalibrator_truth_sensitivity_trance_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantRecalibrator_truth_sensitivity_trance_snps" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantRecalibrator_use_annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_norm_multiallelics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_norm_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_view_include_VQSR_filters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bwa_mem_num_threads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bwa_mem_sec_shorter_split_hits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_exclude_intervalList" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_include_intervalList" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_scatter_count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_fwd_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_rev" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_qc_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_trimming_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "picard_addorreplacereadgroups_rgpl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw_files_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_output_format" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_flagstat_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_compression_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_memory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_cigar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_collapsecigar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_fastcompression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_iscram" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_randomseed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsingroup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsinlibrary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsquality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithoutbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readtagtostrip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_region" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_samheader" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_target_bed_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_uncompressed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_hc_java_options" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_hc_native_pairHMM_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_interval_padding" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_build_over" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_convert_arg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_database_location" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_na_string" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_operation" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_otherinfo" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_protocol" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_remove" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_vcfinput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_compression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_do_not_compress" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_strigency" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_trim_suffix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_indels_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_indels_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_indels_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_snps_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_snps_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_snps_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_arguments_snps_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_indels_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_indels_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_indels_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_snps_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_snps_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_snps_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "vqsr_known_sites_snps_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_norm_vqsr" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_view_filter_vqsr" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bwa_mem_paired" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bwa_mem_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gather_bwa_sam_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_ApplyVQSR_indel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_ApplyVQSR_snp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_CombineGVCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_GenotypeGVCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_MakeSitesOnlyVcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_VQSR_MergeVCFs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_VariantRecalibrator_indel_recal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_VariantRecalibrator_indel_tranches" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_VariantRecalibrator_snp_recal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_VariantRecalibrator_snp_tranches" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_bqsr_subworkflowbqsr_bqsr_bam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_bqsr_subworkflowbqsr_hc" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_bqsr_subworkflowbqsr_mergevcfs" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_bqsr_subworkflowbqsr_tables" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_splitintervals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_addorreplacereadgroups" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_markduplicates" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_markduplicates_metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_fixmate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_flagstat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_view_conversion" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_view_count_total" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_filtered_avinput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_filtered_multianno_txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_filtered_multianno_vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_by_name" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-07-05T09:44:42Z"^^ns1:Date ; ns1:dateModified "2023-07-05T09:45:12Z"^^ns1:Date ; ns1:description """A CWL-based pipeline for calling small germline variants, namely SNPs and small INDELs, by processing data from Whole-genome Sequencing (WGS) or Targeted Sequencing (e.g., Whole-exome sequencing; WES) experiments. On the respective GitHub folder are available: - The CWL wrappers and subworkflows for the workflow - A pre-configured YAML template, based on validation analysis of publicly available HTS data Briefly, the workflow performs the following steps: 1. Quality control of Illumina reads (FastQC) 2. Trimming of the reads (e.g., removal of adapter and/or low quality sequences) (Trim galore) 3. Mapping to reference genome (BWA-MEM) 4. Convertion of mapped reads from SAM (Sequence Alignment Map) to BAM (Binary Alignment Map) format (samtools) 5. Sorting mapped reads based on read names (samtools) 6. Adding information regarding paired end reads (e.g., CIGAR field information) (samtools) 7. Re-sorting mapped reads based on chromosomal coordinates (samtools) 8. Adding basic Read-Group information regarding sample name, platform unit, platform (e.g., ILLUMINA), library and identifier (picard AddOrReplaceReadGroups) 9. Marking PCR and/or optical duplicate reads (picard MarkDuplicates) 10. Collection of summary statistics (samtools) 11. Creation of indexes for coordinate-sorted BAM files to enable fast random access (samtools) 12. Splitting the reference genome into a predefined number of intervals for parallel processing (GATK SplitIntervals) At this point the application of multi-sample workflow follows, during which multiple samples are concatenated into a single, unified VCF (Variant Calling Format) file, which contains the variant information for all samples: 13. 
Application of Base Quality Score Recalibration (BQSR) (GATK BaseRecalibrator and ApplyBQSR tools) 14. Variant calling in gVCF (genomic VCF) mode (-ERC GVCF) (GATK HaplotypeCaller) 15. Merging of all genomic interval-split gVCF files for each sample (GATK MergeVCFs) 16. Generation of the unified VCF file (GATK CombineGVCFs and GenotypeGVCFs tools) 17. Separate annotation for SNP and INDEL variants, using the Variant Quality Score Recalibration (VQSR) method (GATK VariantRecalibrator and ApplyVQSR tools) 18. Variant filtering based on the information added during VQSR and/or custom filters (bcftools) 19. Normalization of INDELs (split multiallelic sites) (bcftools) 20. Annotation of the final dataset of filtered variants with genomic, population-related and/or clinical information (ANNOVAR) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.526.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "CWL, workflow, Germline, variant calling, Genomics" ; ns1:license ; ns1:name "CWL-based (multi-sample) workflow for germline variant calling" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FilterVariantTranches_resource_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FilterVariantTranches_resource_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FilterVariantTranches_resource_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_cluster" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_filter_indel" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_filter_name_indel" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_filter_name_snp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_filter_snp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "VariantFiltration_window" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_norm_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_view_include_CNN_filters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_view_include_hard_filters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bcftoomls_norm_multiallelics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bwa_mem_num_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bwa_mem_sec_shorter_split_hits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_exclude_intervalList" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_include_intervalList" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gatk_splitintervals_scatter_count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_fwd_single" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_file_split_rev" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_qc_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "input_trimming_check" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "picard_addorreplacereadgroups_rgpl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw_files_directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reference_genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_output_format" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_fixmate_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_flagstat_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_compression_level" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_memory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_sort_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_cigar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_collapsecigar" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_count" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_fastcompression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_iscram" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_randomseed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsingroup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsinlibrary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readsquality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readswithoutbits" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_readtagtostrip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_region" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_samheader" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "samtools_view_uncompressed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_interval_padding" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_bqsr_known_sites_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_hc_java_options" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sub_hc_native_pairHMM_threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_build_over" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_convert_arg" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_database_location" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_na_string" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_operation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_otherinfo" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_protocol" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_remove" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "table_annovar_vcfinput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_compression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_do_not_compress" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_quality" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_strigency" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "tg_trim_suffix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_concat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_norm_cnn" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_norm_hard_filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_view_filter_cnn" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_bcftools_view_hard_filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_paired_zip" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_raw_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_html" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_fastqc_single_zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gather_bwa_sam_files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_ApplyBQSR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_CNNScoreVariants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_FilterVariantTranches" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_HaplotypeCaller" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_bqsr_subworkflow" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_gatk_splitintervals" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_addorreplacereadgroups" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_markduplicates" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_picard_markduplicates_metrics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_fixmate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_flagstat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_index_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_sort_by_name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_view_conversion" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_samtools_view_count_total" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_tabix_indels" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_tabix_snps" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_cnn_filtered_avinput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_cnn_filtered_multianno_txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_cnn_filtered_multianno_vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_hard_filtered_avinput" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_hard_filtered_multianno_txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_table_annovar_hard_filtered_multianno_vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_paired_reports" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_fq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "o_trim_galore_single_reports" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-07-05T09:48:09Z"^^ns1:Date ; ns1:dateModified "2023-07-05T09:49:33Z"^^ns1:Date ; ns1:description """A CWL-based pipeline for calling small germline variants, namely SNPs and small INDELs, by processing data from Whole-genome Sequencing (WGS) or Targeted Sequencing (e.g., Whole-exome sequencing; WES) experiments. On the respective GitHub folder are available: - The CWL wrappers and subworkflows for the workflow - A pre-configured YAML template, based on validation analysis of publicly available HTS data Briefly, the workflow performs the following steps: 1. Quality control of Illumina reads (FastQC) 2. Trimming of the reads (e.g., removal of adapter and/or low quality sequences) (Trim galore) 3. Mapping to reference genome (BWA-MEM) 4. Convertion of mapped reads from SAM (Sequence Alignment Map) to BAM (Binary Alignment Map) format (samtools) 5. Sorting mapped reads based on read names (samtools) 6. 
Adding information regarding paired end reads (e.g., CIGAR field information) (samtools) 7. Re-sorting mapped reads based on chromosomal coordinates (samtools) 8. Adding basic Read-Group information regarding sample name, platform unit, platform (e.g., ILLUMINA), library and identifier (picard AddOrReplaceReadGroups) 9. Marking PCR and/or optical duplicate reads (picard MarkDuplicates) 10. Collection of summary statistics (samtools) 11. Creation of indexes for coordinate-sorted BAM files to enable fast random access (samtools) 12. Splitting the reference genome into a predefined number of intervals for parallel processing (GATK SplitIntervals) At this point the application of single-sample workflow follows, during which multiple samples are accepted as input and they are not merged into a unified VCF file but are rather processed separately in each step of the workflow, leading to the production of a VCF file for each sample: 13. Application of Base Quality Score Recalibration (BQSR) (GATK BaseRecalibrator, GatherBQSRReports and ApplyBQSR tools) 14. Variant calling (GATK HaplotypeCaller) 15. Merging of all genomic interval-split gVCF files for each sample (GATK MergeVCFs) 16. Separate annotation of SNPs and INDELs based on pretrained Convolutional Neural Network (CNN) models (GATK SelectVariants, CNNScoreVariants and FilterVariantTranches tools) 17. (Optional) Independent step of hard-filtering (GATK VariantFiltration) 18. Variant filtering based on the information added during VQSR and/or custom filters (bcftools) 19. Normalization of INDELs (split multiallelic sites) (bcftools) 20. 
Annotation of the final dataset of filtered variants with genomic, population-related and/or clinical information (ANNOVAR) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.527.1" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:keywords "CWL, workflow, Germline, variant calling, Genomics" ; ns1:license ; ns1:name "CWL-based (single-sample) workflow for germline variant calling" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-07-09T08:54:36Z"^^ns1:Date ; ns1:dateModified "2023-07-09T09:01:09Z"^^ns1:Date ; ns1:description """# prepareChIPs This is a simple `snakemake` workflow template for preparing **single-end** ChIP-Seq data. The steps implemented are: 1. Download raw fastq files from SRA 2. Trim and Filter raw fastq files using `AdapterRemoval` 3. Align to the supplied genome using `bowtie2` 4. Deduplicate Alignments using `Picard MarkDuplicates` 5. Call Macs2 Peaks using `macs2` A pdf of the rulegraph is available [here](workflow/rules/rulegraph.pdf) Full details for each step are given below. Any additional parameters for tools can be specified using `config/config.yml`, along with many of the requisite paths To run the workflow with default settings, simply run as follows (after editing `config/samples.tsv`) ```bash snakemake --use-conda --cores 16 ``` If running on an HPC cluster, a snakemake profile will required for submission to the queueing system and appropriate resource allocation. Please discuss this will your HPC support team. 
Nodes may also have restricted internet access and rules which download files may not work on many HPCs. Please see below or discuss this with your support team Whilst no snakemake wrappers are explicitly used in this workflow, the underlying scripts are utilised where possible to minimise any issues with HPC clusters with restrictions on internet access. These scripts are based on `v1.31.1` of the snakemake wrappers ### Important Note Regarding OSX Systems It should be noted that this workflow is **currently incompatible with OSX-based systems**. There are two unsolved issues 1. `fasterq-dump` has a bug which is specific to conda environments. This has been updated in v3.0.3 but this patch has not yet been made available to conda environments for OSX. Please check [here](https://anaconda.org/bioconda/sra-tools) to see if this has been updated. 2. The following error appears in some OSX-based R sessions, in a system-dependent manner: ``` Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : polygon edge not found ``` The fix for this bug is currently unknown ## Download Raw Data ### Outline The file `samples.tsv` is used to specify all steps for this workflow. This file must contain the columns: `accession`, `target`, `treatment` and `input` 1. `accession` must be an SRA accession. Only single-end data is currently supported by this workflow 2. `target` defines the ChIP target. All files common to a target and treatment will be used to generate summarised coverage in bigWig Files 3. `treatment` defines the treatment group each file belongs to. If only one treatment exists, simply use the value 'control' or similar for every file 4. `input` should contain the accession for the relevant input sample. These will only be downloaded once. Valid input samples are *required* for this workflow As some HPCs restrict internet access for submitted jobs, *it may be prudent to run the initial rules in an interactive session* if at all possible. 
This can be performed using the following (with 2 cores provided as an example) ```bash snakemake --use-conda --until get_fastq --cores 2 ``` ### Outputs - Downloaded files will be gzipped and written to `data/fastq/raw`. - `FastQC` and `MultiQC` will also be run, with output in `docs/qc/raw` Both of these directories are able to be specified as relative paths in `config.yml` ## Read Filtering ### Outline Read trimming is performed using [AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/). Default settings are customisable using config.yml, with the defaults set to discard reads shorter than 50nt, and to trim using quality scores with a threshold of Q30. ### Outputs - Trimmed fastq.gz files will be written to `data/fastq/trimmed` - `FastQC` and `MultiQC` will also be run, with output in `docs/qc/trimmed` - AdapterRemoval 'settings' files will be written to `output/adapterremoval` ## Alignments ### Outline Alignment is performed using [`bowtie2`](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) and it is assumed that this index is available before running this workflow. The path and prefix must be provided using config.yml This index will also be used to produce the file `chrom.sizes` which is essential for conversion of bedGraph files to the more efficient bigWig files. ### Outputs - Alignments will be written to `data/aligned` - `bowtie2` log files will be written to `output/bowtie2` (not the conventional log directory) - The file `chrom.sizes` will be written to `output/annotations` Both sorted and the original unsorted alignments will be returned. However, the unsorted alignments are marked with `temp()` and can be deleted using ```bash snakemake --delete-temp-output --cores 1 ``` ## Deduplication ### Outline Deduplication is performed using [MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-) from the Picard set of tools. 
By default, deduplication will remove the duplicates from the set of alignments. All resultant bam files will be sorted and indexed. ### Outputs - Deduplicated alignments are written to `data/deduplicated` and are indexed - DuplicationMetrics files are written to `output/markDuplicates` ## Peak Calling ### Outline This is performed using [`macs2 callpeak`](https://pypi.org/project/MACS2/). - Peak calling will be performed on: a. each sample individually, and b. merged samples for those sharing a common ChIP target and treatment group. - Coverage bigWig files for each individual sample are produced using CPM values (i.e. Signal Per Million Reads, SPMR) - For all combinations of target and treatment coverage bigWig files are also produced, along with fold-enrichment bigWig files ### Outputs - Individual outputs are written to `output/macs2/{accession}` + Peaks are written in `narrowPeak` format along with `summits.bed` + bedGraph files are automatically converted to bigWig files, and the originals are marked with `temp()` for subsequent deletion + callpeak log files are also added to this directory - Merged outputs are written to `output/macs2/{target}/` + bedGraph Files are also converted to bigWig and marked with `temp()` + Fold-Enrichment bigWig files are also created with the original bedGraph files marked with `temp()` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.528.1" ; ns1:keywords "Bioinformatics, Genomics, Transcriptomics" ; ns1:license ; ns1:name "prepareChIPs:" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-08-05T13:03:51Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Genome assembly: Unicycler-based WF for Klebsiella pneumoniae [Wick et al. 
Microbial genomics 2017]" ; ns1:input <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8811145>, <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8811148>, <#ont___workflow_wick_et_al_-inputs-https://ndownloader.figshare.com/files/8812159> ; ns1:keywords "ONT" ; ns1:license ; ns1:name "ONT - Workflow-Wick-et.al." ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Milad Miladi" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "fast5-Signals-Raw.tar.gz" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-08-05T13:05:34Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:46Z"^^ns1:Date ; ns1:description "Metagenomics: taxa classification" ; ns1:input ; ns1:keywords "ONT" ; ns1:license ; ns1:name "ONT -- Metagenomics-Kraken2-Krona" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-07-20T15:58:53Z"^^ns1:Date ; ns1:dateModified "2023-07-21T07:41:58Z"^^ns1:Date ; ns1:description "Sample workflow template that combines simulations with data analytics. It is not a real workflow, but it mimics this type of workflows. It illustrates how COMPSs invokes binaries. It can be extended to invoke MPI applications. " ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.541.1" ; ns1:image ; ns1:keywords "COMPSs, Hybrid Workflow, eFlows4HPC, non_data_persistence, Marenostrum IV, Supercomputer, PyCOMPSs" ; ns1:license ; ns1:name "Sample workflow that combines simulations with data analytics." 
; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Anand Maurya" . a ns1:Person ; ns1:name "Maciej Szymanski" . a ns1:Person ; ns1:name "Wojciech Karlowski" . a ns1:ComputerLanguage ; ns1:name "Perl" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-07-31T12:44:27Z"^^ns1:Date ; ns1:dateModified "2023-07-31T12:49:45Z"^^ns1:Date ; ns1:description """## ARA (Automated Record Analysis) : An automatic pipeline for exploration of SRA datasets with sequences as a query ### Requirements - **Docker** - Please check out the [Docker installation](https://docs.docker.com/get-docker/) guide. _or_ - **Mamba package manager** - Please check out the [mamba or micromamba](https://mamba.readthedocs.io/en/latest/installation.html) official installation guide. - We prefer `mamba` over [`conda`](https://docs.conda.io/en/latest/) since it is faster and uses `libsolv` to effectively resolve the dependencies. - `conda` can still be used to install the pipeline using the same commands as described in the installation section. > Note: **It is important to include the 'bioconda' channel in addition to the other channels as indicated in the [official manual](https://bioconda.github.io/#usage "Bioconda - Usage")**. Use the following commands in the given order to configure the channels (one-time setup). > > ```bash > conda config --add channels defaults > conda config --add channels bioconda > conda config --add channels conda-forge > conda config --set channel_priority strict > ``` --- ### Installation The user can install the pipeline by using either Docker or Mamba using the steps mentioned below. First, click the green "Code" button, then select "Download Zip" to begin downloading the contents of this repository. Once the download is complete, extract the zip file into the desired location before starting the setup. 
Please use the commands shown below to begin installing the pipeline. Alternatively, the GitHub repo can also be cloned through the options shown after clicking the "Code" button. Navigate inside the folder by using the `cd ARA/` command before starting the setup. > _Warning: Before starting any analysis with the pipeline, please make sure that the system has enough disk space available for the data you wish to retrieve and process from the SRA repository._ - **Using Docker** ```bash cd ARA-main/ docker build -t ara_img . ``` _or_ - **Using Mamba** ```bash cd ARA-main/ mamba env create --file requirements.yaml mamba activate ara_env perl setup.pl ``` > _Note: After installation, the virtual environment consumes approximately 1.5 GB of disk space. The installation was tested on "Ubuntu 20.04.4 LTS", "Ubuntu 22.04.1 LTS" and "Fedora 37" using the procedure mentioned above._ Please be patient because downloading and configuring the tools/modules may take several minutes. The warning messages that appear during the installation of certain Perl modules can be ignored by users. Optional: The user can also add the current directory to PATH for ease of use. Use the `chmod +x ara.pl` followed by `export PATH="$(pwd):$PATH"` command. Alternatively, the user is free to create symbolic links, copy the executable to `/bin/`, or use any other method depending on their operating system. Refer to the 'Troubleshooting' section in case of any installation related issues. --- ### Example usage - **Docker** `docker run -it ara_img /home/ARA-main/ara.pl --input /home/ARA-main/example/SraRunInfo.csv --sequences /home/ARA-main/example/Arabidopsis_thaliana.TAIR10.ncrna.fa` - **Mamba environment** `perl ara.pl --input example/SraRunInfo.csv --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa` To get full usage info: `perl ara.pl --help` > _Note_: The user can delete the contents of `results/` directory after testing the tool using the example mentioned above. 
### Configuration file The configuration file `conf.txt` is automatically generated during the installation by setup script. It contains certain default parameters as well as the location to the executable binaries of the tools incorporated in the pipeline. The user can modify the default parameters in `conf.txt` and pass it to the pipeline as an input. For example, the `data_perc` option in the configuration refers to the default value of 5% of the dataset selected for analysis. However, the user has the flexibility to provide any integer value between 1 and 100 to specify the desired percentage of the dataset to be used. Similarly, the user can choose between _blastn_ or _bowtie2_ by changing the 'execute flag' to either 0 or 1 in the configuration file while leaving the rest of the parameters to default values. By default, both the tools are enabled _ie_. `execute = 1`. The `read_drop_perc_cutoff` in `conf.txt` config file denotes the cutoff to discard a sample if the total reads left after executing the trimmomatic are higher than the threshold (by default, if the more than 70% of reads are dropped as per the trimmomatic log, then the sample will fail the quality criteria and will not be processed downstream). Please refer the documentation of [Trimmomatic ](https://github.com/usadellab/Trimmomatic) for more details about the parameters present in the config file. Similarly, the criteria to check the minimal alignment rate are indicated by the `alignment perc cutoff` parameter under blastn and bowtie2 in the `conf.txt` configuration file (if the total alignment percentage is less than the threshold then the pipeline will report that the sample failed the quality criteria). More details about the parameters used in the `conf.txt` file can be found in the respective documentations of [Blastn](https://www.ncbi.nlm.nih.gov/books/NBK279690/) and [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml). 
By default, the pipeline uses a pre-built Kraken2 viral genomic database ([release: 9/8/2022](https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20220908.tar.gz)) from . Users can provide their own database by changing the `kraken2_db_path` parameter in the `conf.txt` file. > _Note:_ If the user wishes to use a different installation than Bioconda, the user can manually install the required tools and specify the absolute path of the executable binaries in the configuration. --- ### Pipeline parameters - **`--input`** (mandatory) The user can provide input in either of the following ways: - A single SRA run accession. eg: **`perl ara.pl --input SRR12548227 --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa`** - A list of run accessions in a text file (1 run accession per line). eg: **`perl ara.pl --input example/list.txt --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa`** - The SRA runInfo exported directly from the NCBI-SRA web portal. Goto the [SRA homepage](https://www.ncbi.nlm.nih.gov/sra "Home - NCBI - SRA") and search for the desired keyword. Export the `SraRunInfo.csv` by clicking 'Send to' =\\> File =\\> RunInfo). eg: **`perl ara.pl --input example/SraRunInfo.csv --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa`** - **`--sequences`** (mandatory) The user should provide a fasta file containing the query sequences. - **`--output`** (optional) The output directory to store the results. By default, the output will be stored into the **`results/`** directory of the package. eg: **`perl ara.pl --input example/SraRunInfo.csv --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa --output /src/main/test/`** - **`--mode`** (optional) Choose one of the three modes to run the pipeline. - The **`screen`** is the default mode which will only download a fraction of the data-set per SRA-run accession and analyse the file as per the given configuration. 
- The **`full`** mode will execute the pipeline by downloading the complete fastq file per SRA-run accession. - The **`both`** option searches for samples using a fraction of the data that meet the minimum alignment cutoff from either 'bowtie2' or 'blastn', and then automatically performs alignment by downloading the entire fastq file. eg: **`perl ara.pl --input example/SraRunInfo.csv --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa --output /src/main/test/ --mode screen`** > _Note:_ There is a supporting **`summary`** mode, that will generate a unified alignment summary by examining the output files created by either screen-mode or full-mode. The summary mode should only be used when the user needs to recreate the summary stats from the pre-existing results. The user must enter **`–mode summary`** along with the previously used command parameters to re-generate the summary. - **`--config`** (optional) Pipeline configuration. By default it will use the **`conf.txt`** generated by the setup script. eg: **`perl ara.pl --input example/SraRunInfo.csv --sequences example/Arabidopsis_thaliana.TAIR10.ncrna.fa --output /src/main/test/ --mode screen --config conf.txt`** --- ### Output structure The pipeline will create folders per SRA run accession and generate results using the run accession as the prefix. The analysis related to the screening a fraction of data will be stored in `screening_results` directory whereas the analysis conducted on the whole dataset will be stored in `full_analyis_results` directory. 
An outline of directory structure containing the results is shown below- results/ `-- test/ (name derived from the input fasta sequence file) |-- test.screening.analysis.stats.sorted.by.alignment.txt (combined metadata and analysis report generated after processing all the SRA run accessions, sorted in decreasing order of total alignment percentage) |-- metadata/ | |-- test.metadata.txt (Combined metadata downloaded from SRA) | |-- test.metadata.screened.txt (List of SRA accessions which qualify the filter criteria specified in the config.) | |-- SRA_RUN.run.metadata.txt (unprocessed metadata on a single SRA accession as retrieved from NCBI) |-- reference/ | |-- blastn_db/ (folder containing the blast database created from the input fasta sequence) | |-- bowtie2_index/ (folder containing the bowtie index created from the input fasta sequence) | |-- bowtie2_index.stdout.txt (stdout captured from bowtie2 index creation) | `-- makeblastdb.stdout.txt (stdout captured from blastn database creation) `-- screening_results/ (similar structure for screeing or full mode) |-- SRA_RUN/ (each SRA run accession will be processed into a seperate folder) | |-- blastn/ | | |-- SRA_RUN.blast.results.txt (output from NCBI Blastn) | | `-- blast.stats.txt (blastn overall alignment stats) | |-- bowtie2/ | | |-- SRA_RUN.bam (output from bowtie2) | | |-- alignment.stats.txt (bowtie2 stdout) | | `-- alignment.txt (bowtie2 overall alignment summary) | |-- fastQC/ | | |-- | | |-- | |-- kraken2/ | | |-- SRA_RUN.kraken (kraken2 standard classification table) | | |-- SRA_RUN.report (kraken2 classification report) | | `-- SRA_RUN.stdout.txt (kraken2 stdout) | |-- raw_fastq/ | | |-- | | |-- fastq_dump.stdout.txt | | |-- sra/ | | `-- wget.full.sra.stdout.txt | `-- trimmed_data/ | |-- | `-- SRA_RUN_trim_stdout_log.txt (trimmomatic stdout) `-- runlog.SRA_RUN.txt (Complete run log of the pipeline per SRA run accession) For a thorough understanding of the results of the third-party tools, take a look 
at the following documentations: - [Blastn](https://www.ncbi.nlm.nih.gov/books/NBK279690/) - [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) - [Kraken2](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown) - [Trimmomatic](https://github.com/usadellab/Trimmomatic) --- ### Disk usage using the input from the example The table below provides a summary of the disk usage for different analyses conducted on varying dataset sizes. It demonstrates how disk usage can increase depending on the choice of the fraction of the dataset the user wishes to analyze. | RUN ACCESSION | 100% of dataset | 5% of dataset | 10% of dataset | | ------------- | --------------- | ------------- | -------------- | | SRR8392720 | 1.3G | 85M | 156M | | SRR7289585 | 1.4G | 150M | 288M | | SRR12548227 | 15M | 9.0M | 9.1M | This summary highlights how the disk usage (in megabytes or gigabytes) can vary depending on the chosen fraction of the dataset for analysis. --- ### Troubleshooting - Errors related to mamba/conda environment: Since `mamba` is a drop-in replacement and uses the same commands and configuration options as **conda**, it's possible to swap almost all commands between **conda** & **mamba**. Use **`conda list`** command to verify whether the packages mentioned in the `requirements.yaml` are successfully installed into your environment. > _Note:_ The `requirements.yaml` provided in this package was exported from `mamba 0.25.0` installation running on `Ubuntu 20.04.4 LTS`. In case of any missing tool/ conflicting dependencies in the environment, the user can try using **`conda search `** or `mamba repoquery search ` command to find the supported version of the tool and then manually install it by typing **`conda install `** or `mamba install ` inside the environment. 
Please refer the official [troubleshooting guide](https://conda.io/projects/conda/en/latest/user-guide/troubleshooting.html "User guide » Troubleshooting") for further help. > _Note:_ On macOS and Linux, the supported tools and their dependencies aren't always the same. Even when all of the requirements are completely aligned, the set of available versions isn't necessarily the same. User may try setting up the environment using any of the supplementary `requirements-*.txt` provided in the `src/main/resources/` directory. - Error installing Perl modules: Users must ensure that they have write permission to the `/Users/\\*/.cpan/` or similar directory, and the CPAN is properly configured. You might need to define the PERLLIB/PERL5LIB environment variable if you see an error similar to the following: ```bash Cant locate My/Module.pm in @INC (@INC contains: ... ... .). BEGIN failed--compilation aborted. ``` > _Note about MAKE_: 'make' is an essential tool for building Perl modules. Please make sure that you have 'make' installed in your system. The setup script provided in this package utilizes 'cpan' to build the required Perl modules automatically. If the automatic setup provided in the package fails to install the required dependencies, you may need to install them manually by using the command `cpan install ` or searching the package on [Metacpan](https://metacpan.org/). Additionally, some Perl modules can also be installed through `mamba` (eg. 
the compatible version of Perl module `Config::Simple` can be searched on mamba by `mamba repoquery search perl-config-simple`) --- ### List of Perl modules and tools incorporated in the pipeline - Perl modules: - Config::Simple - Parallel::ForkManager - Log::Log4perl - Getopt::Long - Text::CSV - Text::Unidecode - Tools: - [NCBI EDirect utilities \\>=16.2](https://www.ncbi.nlm.nih.gov/books/NBK179288/) - [NCBI SRA Toolkit \\>=2.10.7](https://www.ncbi.nlm.nih.gov/home/tools/) - [FastQC \\>=0.11.9](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc) - [Trimmomatic \\>=0.39](http://www.usadellab.org/cms/?page=trimmomatic) - [FASTX-Toolkit \\>=0.0.14](http://hannonlab.cshl.edu/fastx_toolkit/) - [NCBI Blast \\>=2.10.1](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) - [Bowtie2 \\>=2.4.5](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) - [Samtools \\>=1.15.1](http://www.htslib.org/download/) - [Kraken2 \\>=2.1.2](https://ccb.jhu.edu/software/kraken2/) --- """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.546.1" ; ns1:keywords "Genomics, Pipeline, Perl, ncbi sra, sequence annotation, sequence search" ; ns1:license ; ns1:name "ARA (Automated Record Analysis)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-08-01T01:34:42Z"^^ns1:Date ; ns1:dateModified "2023-08-03T18:15:31Z"^^ns1:Date ; ns1:description """# GERONIMO ## Introduction GERONIMO is a bioinformatics pipeline designed to conduct high-throughput homology searches of structural genes using covariance models. These models are based on the alignment of sequences and the consensus of secondary structures. 
The pipeline is built using Snakemake, a workflow management tool that allows for the reproducible execution of analyses on various computational platforms. The idea for developing GERONIMO emerged from a comprehensive search for [telomerase RNA in lower plants] and was subsequently refined through an [expanded search of telomerase RNA across Insecta]. GERONIMO can test hundreds of genomes and ensures the stability and reproducibility of the analyses performed. [telomerase RNA in lower plants]: https://doi.org/10.1093/nar/gkab545 [expanded search of telomerase RNA across Insecta]: https://doi.org/10.1093/nar/gkac1202 ## Scope The GERONIMO tool utilises covariance models (CMs) to conduct homology searches of RNA sequences across a wide range of gene families in a broad evolutionary context. Specifically, it can be utilised to: * Detect RNA sequences that share a common evolutionary ancestor * Identify and align orthologous RNA sequences among closely related species, as well as paralogous sequences within a single species * Identify conserved non-coding RNAs in a genome, and extract upstream genomic regions to characterise potential promoter regions. It is important to note that GERONIMO is a computational tool, and as such, it is intended to be run on a computer with a small amount of data. Appropriate computational infrastructure is necessary for analysing hundreds of genomes. Although GERONIMO was primarily designed for Telomerase RNA identification, its functionality extends to include the detection and alignment of other RNA gene families, including **rRNA**, **tRNA**, **snRNA**, **miRNA**, and **lncRNA**. This can aid in identifying paralogs and orthologs across different species that may carry specific functions, making it useful for phylogenetic analyses. It is crucial to remember that some gene families may exhibit similar characteristics but different functions. 
Therefore, analysing the data and functional annotation after conducting the search is essential to characterise the sequences properly. ## Pipeline overview By default, the GERONIMO pipeline conducts high-throughput searches of homology sequences in downloaded genomes utilizing covariance models. If a significant similarity is detected between the model and genome sequence, the pipeline extracts the upstream region, making it convenient to identify the promoter of the discovered gene. In brief, the pipeline: - Compiles a list of genomes using the NCBI's [Entrez] database based on a specified query, *e.g. "Rhodophyta"[Organism]* - Downloads and decompresses the requested genomes using *rsync* and *gunzip*, respectively - *Optionally*, generates a covariance model based on a provided alignment using [Infernal] - Conducts searches among the genomes using the covariance model [Infernal] - Supplements genome information with taxonomy data using [rentrez] - Expands the significant hits sequence by extracting upstream genomic regions using [*blastcmd*] - Compiles the results, organizes them into a tabular format, and generates a visual summary of the performed analysis. [Entrez]: https://www.ncbi.nlm.nih.gov/books/NBK179288/ [Infernal]: http://eddylab.org/infernal/ [rentrez]: https://github.com/ropensci/rentrez [*blastcmd*]: https://www.ncbi.nlm.nih.gov/books/NBK569853/ ## Quick start GERONIMO is available as a `snakemake pipeline` running on Linux and Windows operating systems. ### Windows 10 Install Linux on Windows 10 (WSL) according to [instructions], which boils down to opening PowerShell or Windows Command Prompt in *administrator mode* and pasting the following: ```shell wsl --install wsl.exe --install UBUNTU ``` Then restart the machine and follow the instructions for setting up the Linux environment. 
[instructions]: https://learn.microsoft.com/en-us/windows/wsl/install ### Linux: #### Check whether the conda is installed: ```shell conda -V ``` > GERONIMO was tested on conda 23.3.1 #### 1) If you do not have installed `conda`, please install `miniconda` Please follow the instructions for installing [miniconda] [miniconda]: https://conda.io/projects/conda/en/stable/user-guide/install/linux.html #### 2) Continue with installing `mamba` (recommended but optional) ```shell conda install -n base -c conda-forge mamba ``` #### 3) Install `snakemake` ```shell conda activate base mamba create -p env_snakemake -c conda-forge -c bioconda snakemake mamba activate env_snakemake snakemake --help ``` In case of complications, please check the section `Questions & Answers` below or follow the [official documentation] for troubleshooting. [official documentation]: https://snakemake.readthedocs.io/en/stable/getting_started/installation.html ### Clone the GERONIMO repository Go to the path in which you want to run the analysis and clone the repository: ```shell cd git clone https://github.com/amkilar/GERONIMO.git ``` ### Run sample analysis to ensure GERONIMO installation was successful All files are prepared for the sample analysis as a default. Please execute the line below: ```shell snakemake -s GERONIMO.sm --cores 1 --use-conda results/summary_table.xlsx ``` This will prompt GERONIMO to quickly scan all modules, verifying the correct setup of the pipeline without executing any analysis. You should see the message `Building DAG of jobs...`, followed by `Nothing to be done (all requested files are present and up to date).`, when successfully completed. If you want to run the sample analysis fully, please remove the folder `results` from the GERONIMO directory and execute GERONIMO again with: `snakemake -s GERONIMO.sm --cores 1 --use-conda results/summary_table.xlsx` > You might consider allowing more cores to speed up the analysis, which might take up to several hours. 
#### You might want to clean `GERONIMO/` directory from the files produced by the example analysis. You can safely remove the following: - `GERONIMO/results` - `GERONIMO/database` - `GERONIMO/taxonomy` - `GERONIMO/temp` - `.create_genome_list.touch` - `list_of_genomes.txt` ## Setup the inputs ### 1) Prepare the `covariance models`: #### Browse the collection of available `covariance models` at [Rfam] (*You can find the covariance model in the tab `Curation`.*) Paste the covariance model to the folder `GERONIMO/models` and ensure its name follows the convention: `cov_model_` [Rfam]: https://rfam.org/ #### **OR** #### Prepare your own `covariance model` using [LocARNA] 1. Paste or upload your sequences to the web server and download the `.stk` file with the alignment result. > *Please note that the `.stk` file format is crucial for the analysis, containing sequence alignment and secondary structure consensus.* > The LocARNA web service allows you to align 30 sequences at once - if you need to align more sequences, please use the standalone version available [here] > After installation run: ```shell mlocarna my_fasta_sequences.fasta ``` 2. 
Paste the `.stk` alignment file to the folder `GERONIMO/model_to_build` and ensure its name follows the convention: `.stk` > Please check the example `heterotrichea.stk` format in `GERONIMO/models_to_built` for reference [LocARNA]: http://rna.informatik.uni-freiburg.de/LocARNA/Input.jsp [here]: http://www.bioinf.uni-freiburg.de/Software/LocARNA/ ### 2) Adjust the `config.yaml` file Please adjust the analysis specifications, as in the following example: > - database: ' [Organism]' (in case of difficulties with defining the database query, please follow the instructions below) > - extract_genomic_region-length: (here you can determine how long the upstream genomic region should be extracted; tested for 200) > - models: ["", ""] (here specify the names of models that should be used to perform analysis) > > *Here you can also insert the name of the covariance model you want to build with GERONIMO - just be sure you placed `.stk` file in `GERONIMO/models_to_build` before starting analysis* > - CPU_for_model_building: (specify the number of available CPUs devoted to the process of building model (cannot exceed the CPU number allowed to snakemake with `--cores`) > > *You might ignore this parameter when you do not need to create a new covariance model* Keep in mind that the covariance models and alignments must be present in the respective GERONIMO folders. ### 3) Remove folder `results`, which contains example analysis output ### 4) **Please ensure you have enough storage capacity to download all the requested genomes (in the `GERONIMO/` directory)** ## Run GERONIMO ```shell mamba activate env_snakemake cd ~/GERONIMO snakemake -s GERONIMO.sm --cores --use-conda results/summary_table.xlsx ``` ## Example results ### Outputs characterisation #### A) Summary table The Excel table contains the results arranged by taxonomy information and hit significance. 
The specific columns include: * family, organism_name, class, order, phylum (taxonomy context) * GCA_id - corresponds to the genome assembly in the *NCBI database* * model - describes which covariance model identified the result * label - follows the *Infernal* convention of categorizing hits * number - the counter of the result * e_value - indicates the significance level of the hit * HIT_sequence - the exact HIT sequence found by *Infernal*, which corresponds to the covariance model * HIT_ID - describes in which part of the genome assembly the hit was found, which may help publish novel sequences * extended_genomic_region - upstream sequence, which may contain a possible promoter sequence * secondary_structure - the secondary structure consensus of the covariance model #### B) Significant Hits Distribution Across Taxonomy Families The plot provides an overview of the number of genomes in which at least one significant hit was identified, grouped by family. The bold black line corresponds to the number of genomes present in each family, helping to minimize bias regarding unequal data representation across the taxonomy. #### C) Hits Distribution in Genomes Across Families The heatmap provides information about the most significant hits from the genome, identified by a specific covariance model. Genomes are grouped by families (on the right). Hits are classified into three categories based on their e-values. Generally, these categories correspond to hit classifications ("HIT," "MAYBE," "NO HIT"). The "HIT" category is further divided to distinguish between highly significant hits and moderately significant ones. ### GERONIMO directory structure The GERONIMO directory structure is designed to produce files in a highly structured manner, ensuring clear insight and facilitating the analysis of results. 
During a successful run, GERONIMO produces the following folders: * `/database` - which contains genome assemblies that were downloaded from the *NCBI database* and grouped in subfolders * `/taxonomy` - where taxonomy information is gathered and stored in the form of tables * `/results` - the main folder containing all produced results: * `/infernal_raw` - contains the raw results produced by *Infernal* * `/infernal` - contains restructured results of *Infernal* in table format * `/cmdBLAST` - contains results of *cmdblast*, which extracts the extended genomic region * `/summary` - contains summary files that join results from *Infernal*, *cmdblast*, and attach taxonomy context * `/plots` - contains two types of summary plots * `/temp` - folder contains the information necessary to download genome assemblies from *NCBI database* * `/env` - stores instructions for dependency installation * `/models` - where calibrated covariance models can be pasted, *for example, from the Rfam database* * `/modes_to_built` - where multiple alignments in *.stk* format can be pasted * `/scripts` - contains developed scripts that perform results structurization #### The example GERONIMO directory structure: ```shell GERONIMO ├── database │   ├── GCA_000091205.1_ASM9120v1_genomic │   ├── GCA_000341285.1_ASM34128v1_genomic │   ├── GCA_000350225.2_ASM35022v2_genomic │   └── ... ├── env ├── models ├── model_to_build ├── results │   ├── cmdBLAST │   │   ├── MRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   └── ... 
│   │   ├── SRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   │   ├── extended │   │   │   │   └── filtered │   │   │   └── ... │   │   ├── ... │   ├── infernal │   │   ├── MRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   ├── ... │   │   ├── SRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   ├── ... │   ├── plots │   ├── raw_infernal │   │   ├── MRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   ├── ... │   │   ├── SRP │   │   │   ├── GCA_000091205.1_ASM9120v1_genomic │   │   │   ├── GCA_000341285.1_ASM34128v1_genomic │   │   │   ├── GCA_000350225.2_ASM35022v2_genomic │   │   │   ├── ... │   └── summary │   ├── GCA_000091205.1_ASM9120v1_genomic │   ├── GCA_000341285.1_ASM34128v1_genomic │   ├── GCA_000350225.2_ASM35022v2_genomic │   ├── ... ├── scripts ├── taxonomy └── temp ``` ## GERONIMO applicability ### Expanding the evolutionary context To add new genomes or database queries to an existing analysis, please follow the instructions: 1) Rename the `list_of_genomes.txt` file to `previous_list_of_genomes.txt` or any other preferred name. 2) Modify the `config.yaml` file by replacing the previous database query with the new one. 
3) Delete: - `summary_table.xlsx`, `part_summary_table.csv`, `summary_table_models.xlsx` files located in the `GERONIMO\\results` directory - `.create_genome_list.touch` file 5) Run GERONIMO to calculate new results using the command: ```shell snakemake -s GERONIMO.sm --cores --use-conda results/summary_table.xlsx ``` 7) Once the new results are generated, reviewing them before merging them with the original results is recommended. 8) Copy the contents of the `previous_list_of_genomes.txt` file and paste them into the current `list_of_genomes.txt`. 9) Delete: - `summary_table.xlsx` located in the `GERONIMO\\results` directory - `.create_genome_list.touch` file 10) Run GERONIMO to merge the results from both analyses using the command: ```shell snakemake -s GERONIMO.sm --cores 1 --use-conda results/summary_table.xlsx ``` ### Incorporating new covariance models into existing analysis 1) Copy the new covariance model to `GERONIMO/models` 2) Modify the `config.yaml` file by adding the name of the new model to the line `models: [...]` 3) Run GERONIMO to see the updated analysis outcome ### Building a new covariance model With GERONIMO, building a new covariance model from multiple sequence alignment in the `.stk` format is possible. To do so, simply paste `.stk` file to `GERONIMO/models_to_build` and paste the name of the new covariance model to `config.yaml` file to the line `models: [""]` and run GERONIMO. ## Questions & Answers ### How to specify the database query? - Visit the [NCBI Assemblies] website. - Follow the instruction on the graphic below: [NCBI Assemblies]: https://www.ncbi.nlm.nih.gov/assembly/?term= ### WSL: problem with creating `snakemake_env` In the case of an error similar to the one below: > CondaError: Unable to create prefix directory '/mnt/c/Windows/system32/env_snakemake'. > Check that you have sufficient permissions. You might try to delete the cache with: `rm -r ~/.cache/` and try again. 
### When `snakemake` does not seem to be installed properly In the case of the following error: > Command 'snakemake' not found ... Check whether the `env_snakemake` is activated. > It should result in a change from (base) to (env_snakemake) before your login name in the command line window. If you still see `(base)` before your login name, please try to activate the environment with conda: `conda activate env_snakemake` Please note that you might need to specify the full path to the `env_snakemake`, like /home/your user name/env_snakemake ### How to browse GERONIMO results obtained in WSL? You can easily access the results obtained on WSL from your Windows environment by opening `File Explorer` and pasting the following line into the search bar: `\\\\wsl.localhost\\Ubuntu\\home\\`. This will reveal a folder with your username, as specified during the configuration of your Ubuntu system. To locate the GERONIMO results, simply navigate to the folder with your username and then to the `home` folder. (`\\\\wsl.localhost\\Ubuntu\\home\\\\home\\GERONIMO`) ### GERONIMO occupies a lot of storage space Through genome downloads, GERONIMO can potentially consume storage space, rapidly leading to a shortage. Currently, downloading genomes is an essential step for optimal GERONIMO performance. Regrettably, if the analysis is rerun without the `/database` folder, it will result in the need to redownload genomes, which is a highly time-consuming process. Nevertheless, if you do not intend to repeat the analysis and have no requirement for additional genomes or models, you are welcome to retain your results tables and plots while removing the remaining files. It is strongly advised against using local machines for extensive analyses. If you lack access to external storage space, it is recommended to divide the analysis into smaller segments, which can be later merged, as explained in the section titled `Expanding the evolutionary context`. 
Considering this limitation, I am currently working on implementing a solution that will help circumvent the need for redundant genome downloads without compromising GERONIMO performance in the future. You might consider deleting the `.snakemake` folder to free up storage space. However, please note that deleting this folder will require the reinstallation of GERONIMO dependencies when the analysis is rerun. ## License Copyright (c) 2023 Agata M. Kilar Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ## Contact mgr inż. Agata Magdalena Kilar, PhD (agata.kilar@ceitec.muni.cz) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.547.1" ; ns1:image ; ns1:keywords "Bioinformatics, Snakemake, rna" ; ns1:license ; ns1:name "GERONIMO" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T14:40:24Z"^^ns1:Date ; ns1:dateModified "2026-03-25T14:42:03Z"^^ns1:Date ; ns1:description """# Protein Conformational Transitions calculations tutorial using BioExcel Building Blocks (biobb) and GOdMD This tutorial aims to illustrate the process of computing a conformational transition between two known structural conformations of a protein, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.548.4" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein Conformational Transitions calculations tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_molecules_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_aln_orig_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_aln_target_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_ene_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_trj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T14:42:14Z"^^ns1:Date ; ns1:dateModified "2026-03-25T14:45:12Z"^^ns1:Date ; ns1:description """# Protein Conformational Transitions calculations tutorial using BioExcel Building Blocks (biobb) and GOdMD This tutorial aims to illustrate the process of computing a conformational transition between two known structural conformations of a protein, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.549.2" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Protein Conformational Transitions calculations tutorial" ; ns1:output , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-03-24T12:27:42Z"^^ns1:Date ; ns1:dateModified "2026-03-24T12:33:12Z"^^ns1:Date ; ns1:description """# Automatic Ligand parameterization tutorial using BioExcel Building Blocks (biobb) *** This tutorial aims to illustrate the process of **ligand parameterization** for a **small molecule**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **Sulfasalazine** protein (3-letter code SAS), used to treat rheumatoid arthritis, ulcerative colitis, and Crohn's disease. **OpenBabel and ACPype** packages are used to **add hydrogens, energetically minimize the structure**, and **generate parameters** for the **GROMACS** package. With *Generalized Amber Force Field (GAFF) forcefield and AM1-BCC* charges. *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.54.8" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook GMX Notebook Automatic Ligand Parameterization tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 8 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T14:45:21Z"^^ns1:Date ; ns1:dateModified "2026-03-25T14:47:00Z"^^ns1:Date ; ns1:description """# Protein Conformational Transitions calculations tutorial using BioExcel Building Blocks (biobb) and GOdMD This tutorial aims to illustrate the process of computing a conformational transition between two known structural conformations of a protein, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.550.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Protein Conformational Transitions calculations tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T12:37:01Z"^^ns1:Date ; ns1:dateModified "2026-03-25T12:52:52Z"^^ns1:Date ; ns1:description """# Macromolecular Coarse-Grained Flexibility (FlexServ) tutorial using BioExcel Building Blocks (biobb) This tutorial aims to illustrate the process of generating protein conformational ensembles from 3D structures and analysing its molecular flexibility, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.551.4" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Macromolecular Coarse-Grained Flexibility tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Config file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_structure_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_dat_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pdb_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_json_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_crd_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_log_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_cpptraj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_traj_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pcz_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pcz_path" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_pcz_path" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T12:39:55Z"^^ns1:Date ; ns1:dateModified "2026-03-25T12:47:05Z"^^ns1:Date ; ns1:description """# Macromolecular Coarse-Grained Flexibility (FlexServ) tutorial using BioExcel Building Blocks (biobb) This tutorial aims to illustrate the process of generating protein conformational ensembles from 3D structures and analysing its molecular flexibility, step by step, using the BioExcel Building Blocks library (biobb). 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.552.2" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "CWL Macromolecular Coarse-Grained Flexibility tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-25T12:47:21Z"^^ns1:Date ; ns1:dateModified "2026-03-25T12:49:26Z"^^ns1:Date ; ns1:description """# Macromolecular Coarse-Grained Flexibility (FlexServ) tutorial using BioExcel Building Blocks (biobb) This tutorial aims to illustrate the process of generating protein conformational ensembles from 3D structures and analysing its molecular flexibility, step by step, using the BioExcel Building Blocks library (biobb). 
*** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.553.3" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Python Macromolecular Coarse-Grained Flexibility tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-08-02T11:41:06Z"^^ns1:Date ; ns1:dateModified "2023-08-02T11:41:06Z"^^ns1:Date ; ns1:description """[![Snakemake](https://img.shields.io/badge/snakemake-≥7.0.0-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io) # About SnakeMAGs SnakeMAGs is a workflow to reconstruct prokaryotic genomes from metagenomes. The main purpose of SnakeMAGs is to process Illumina data from raw reads to metagenome-assembled genomes (MAGs). SnakeMAGs is efficient, easy to handle and flexible to different projects. The workflow is CeCILL licensed, implemented in Snakemake (run on multiple cores) and available for Linux. 
SnakeMAGs performed eight main steps: - Quality filtering of the reads - Adapter trimming - Filtering of the host sequences (optional) - Assembly - Binning - Evaluation of the quality of the bins - Classification of the MAGs - Estimation of the relative abundance of the MAGs ![scheme of workflow](SnakeMAGs_schema.jpg?raw=true) # How to use SnakeMAGs ## Install conda The easiest way to install and run SnakeMAGs is to use [conda](https://www.anaconda.com/products/distribution). These package managers will help you to easily install [Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html). ## Install and activate Snakemake environment Note: The workflow was developed with Snakemake 7.0.0 ``` conda activate # First, set up your channel priorities conda config --add channels defaults conda config --add channels bioconda conda config --add channels conda-forge # Then, create a new environment for the Snakemake version you require conda create -n snakemake_7.0.0 snakemake=7.0.0 # And activate it conda activate snakemake_7.0.0 ``` Alternatively, you can also install Snakemake via mamba: ``` # If you do not have mamba yet on your machine, you can install it with: conda install -n base -c conda-forge mamba # Then you can install Snakemake conda activate base mamba create -c conda-forge -c bioconda -n snakemake snakemake # And activate it conda activate snakemake ``` ## SnakeMAGs executable The easiest way to procure SnakeMAGs and its related files is to clone the repository using git: ``` git clone https://github.com/Nachida08/SnakeMAGs.git ``` Alternatively, you can download the relevant files: ``` wget https://github.com/Nachida08/SnakeMAGs/blob/main/SnakeMAGs.smk https://github.com/Nachida08/SnakeMAGs/blob/main/config.yaml ``` ## SnakeMAGs input files - Illumina paired-end reads in FASTQ. - Adapter sequence file ([adapter.fa](https://github.com/Nachida08/SnakeMAGs/blob/main/adapters.fa)). 
- Host genome sequences in FASTA (if host_genome: "yes"), in case you work with host-associated metagenomes (e.g. human gut metagenome). ## Download Genome Taxonomy Database (GTDB) GTDB-Tk requires ~66G+ of external data (GTDB) that need to be downloaded and unarchived. Because this database is voluminous, we let you decide where you want to store it. SnakeMAGs does not download GTDB automatically; you have to do it yourself: ``` #Download the latest release (tested with release207) #Note: SnakeMAGs uses GTDBtk v2.1.0 and therefore requires release 207 as the minimum version. See https://ecogenomics.github.io/GTDBTk/installing/index.html#installing for details. wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_v2_data.tar.gz #Decompress tar -xzvf *tar.gz #This will create a folder called release207_v2 ``` All you have to do now is to indicate the path to the database folder (in our example, the folder is called release207_v2) in the config file, Classification section. ## Download the GUNC database (required if gunc: "yes") GUNC accepts either a progenomes or GTDB based reference database. Both can be downloaded using the ```gunc download_db``` command. For our study we used the default proGenome-derived GUNC database. It requires fewer resources with similar performance. ``` conda activate # Install and activate GUNC environment conda create --prefix /path/to/gunc_env conda install -c bioconda gunc --prefix /path/to/gunc_env source activate /path/to/gunc_env #Download the proGenome-derived GUNC database (tested with gunc_db_progenomes2.1) #Note: SnakeMAGs uses GUNC v1.0.5 gunc download_db -db progenomes /path/to/GUNC_DB ``` All you have to do now is to indicate the path to the GUNC database file in the config file, Bins quality section. ## Edit config file You need to edit the config.yaml file. 
In particular, you need to set the correct paths: for the working directory, to specify where are your fastq files, where you want to place the conda environments (that will be created using the provided .yaml files available in [SnakeMAGs_conda_env directory](https://github.com/Nachida08/SnakeMAGs/tree/main/SnakeMAGs_conda_env)), where are the adapters, where is GTDB and optionally where is the GUNC database and where is your host genome reference. Lastly, you need to allocate the proper computational resources (threads, memory) for each of the main steps. These can be optimized according to your hardware. Here is an example of a config file: ``` ##################################################################################################### ##### _____ ___ _ _ _ ______ __ __ _______ _____ ##### ##### / ___| | \\ | | /\\ | | / / | ____| | \\ / | /\\ / _____| / ___| ##### ##### | (___ | |\\ \\ | | / \\ | |/ / | |____ | \\/ | / \\ | | __ | (___ ##### ##### \\___ \\ | | \\ \\| | / /\\ \\ | |\\ \\ | ____| | |\\ /| | / /\\ \\ | | |_ | \\___ \\ ##### ##### ____) | | | \\ | / /__\\ \\ | | \\ \\ | |____ | | \\/ | | / /__\\ \\ | |____|| ____) | ##### ##### |_____/ |_| \\__| /_/ \\_\\ |_| \\_\\ |______| |_| |_| /_/ \\_\\ \\______/ |_____/ ##### ##### ##### ##################################################################################################### ############################ ### Execution parameters ### ############################ working_dir: /path/to/working/directory/ #The main directory for the project raw_fastq: /path/to/raw_fastq/ #The directory that contains all the fastq files of all the samples (eg. sample1_R1.fastq & sample1_R2.fastq, sample2_R1.fastq & sample2_R2.fastq...) suffix_1: "_R1.fastq" #Main type of suffix for forward reads file (eg. _1.fastq or _R1.fastq or _r1.fastq or _1.fq or _R1.fq or _r1.fq ) suffix_2: "_R2.fastq" #Main type of suffix for reverse reads file (eg. 
_2.fastq or _R2.fastq or _r2.fastq or _2.fq or _R2.fq or _r2.fq ) ########################### ### Conda environments ### ########################### conda_env: "/path/to/SnakeMAGs_conda_env/" #Path to the provided SnakeMAGs_conda_env directory which contains the yaml file for each conda environment ######################### ### Quality filtering ### ######################### email: name.surname@your-univ.com #Your e-mail address threads_filter: 10 #The number of threads to run this process. To be adjusted according to your hardware resources_filter: 150 #Memory according to tools need (in GB) ######################## ### Adapter trimming ### ######################## adapters: /path/to/working/directory/adapters.fa #A fasta file containing a set of various Illumina adaptors (this file is provided and is also available on github) trim_params: "2:40:15" #For further details, see the Trimmomatic documentation threads_trim: 10 #The number of threads to run this process. To be adjusted according to your hardware resources_trim: 150 #Memory according to tools need (in GB) ###################### ### Host filtering ### ###################### host_genome: "yes" #yes or no. An optional step for host-associated samples (eg. termite, human, plant...) threads_bowtie2: 50 #The number of threads to run this process. To be adjusted according to your hardware host_genomes_directory: /path/to/working/host_genomes/ #the directory where the host genome is stored host_genomes: /path/to/working/host_genomes/host_genomes.fa #A fasta file containing the DNA sequences of the host genome(s) threads_samtools: 50 #The number of threads to run this process. To be adjusted according to your hardware resources_host_filtering: 150 #Memory according to tools need (in GB) ################ ### Assembly ### ################ threads_megahit: 50 #The number of threads to run this process. 
To be adjusted according to your hardware min_contig_len: 1000 #Minimum length (in bp) of the assembled contigs k_list: "21,31,41,51,61,71,81,91,99,109,119" #Kmer size (for further details, see the megahit documentation) resources_megahit: 250 #Memory according to tools need (in GB) ############### ### Binning ### ############### threads_bwa: 50 #The number of threads to run this process. To be adjusted according to your hardware resources_bwa: 150 #Memory according to tools need (in GB) threads_samtools: 50 #The number of threads to run this process. To be adjusted according to your hardware resources_samtools: 150 #Memory according to tools need (in GB) seed: 19860615 #Seed number for reproducible results threads_metabat: 50 #The number of threads to run this process. To be adjusted according to your hardware minContig: 2500 #Minimum length (in bp) of the contigs resources_binning: 250 #Memory according to tools need (in GB) #################### ### Bins quality ### #################### #checkM threads_checkm: 50 #The number of threads to run this process. To be adjusted according to your hardware resources_checkm: 250 #Memory according to tools need (in GB) #bins_quality_filtering completion: 50 #The minimum completion rate of bins contamination: 10 #The maximum contamination rate of bins parks_quality_score: "yes" #yes or no. If yes bins are filtered according to the Parks quality score (completion-5*contamination >= 50) #GUNC gunc: "yes" #yes or no. An optional step to detect and discard chimeric and contaminated genomes using the GUNC tool threads_gunc: 50 #The number of threads to run this process. 
To be adjusted according to your hardware resources_gunc: 250 #Memory according to tools need (in GB) GUNC_db: /path/to/GUNC_DB/gunc_db_progenomes2.1.dmnd #Path to the downloaded GUNC database (see the readme file) ###################### ### Classification ### ###################### GTDB_data_ref: /path/to/downloaded/GTDB #Path to uncompressed GTDB-Tk reference data (GTDB) threads_gtdb: 10 #The number of threads to run this process. To be adjusted according to your hardware resources_gtdb: 250 #Memory according to tools need (in GB) ################## ### Abundances ### ################## threads_coverM: 10 #The number of threads to run this process. To be adjusted according to your hardware resources_coverM: 150 #Memory according to tools need (in GB) ``` # Run SnakeMAGs If you are using a workstation with Ubuntu (tested on Ubuntu 22.04): ```{bash} snakemake --cores 30 --snakefile SnakeMAGs.smk --use-conda --conda-prefix /path/to/SnakeMAGs_conda_env/ --configfile /path/to/config.yaml --keep-going --latency-wait 180 ``` If you are working on a cluster with Slurm (tested with version 18.08.7): ```{bash} snakemake --snakefile SnakeMAGs.smk --cluster 'sbatch -p --mem -c -o "cluster_logs/{wildcards}.{rule}.{jobid}.out" -e "cluster_logs/{wildcards}.{rule}.{jobid}.err" ' --jobs --use-conda --conda-frontend conda --conda-prefix /path/to/SnakeMAGs_conda_env/ --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 180 --configfile /path/to/config.yaml --keep-going ``` If you are working on a cluster with SGE (tested with version 8.1.9): ```{bash} snakemake --snakefile SnakeMAGs.smk --cluster "qsub -cwd -V -q -pe thread {threads} -e cluster_logs/{rule}.e{jobid} -o cluster_logs/{rule}.o{jobid}" --jobs --use-conda --conda-frontend conda --conda-prefix /path/to/SnakeMAGs_conda_env/ --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 180 --configfile /path/to/config.yaml --keep-going ``` # Test We provide you a small data set in the 
[test](https://github.com/Nachida08/SnakeMAGs/tree/main/test) directory which will allow you to validate your installation and take your first steps with SnakeMAGs. This data set is a subset from [ZymoBiomics Mock Community](https://www.zymoresearch.com/blogs/blog/zymobiomics-microbial-standards-optimize-your-microbiomics-workflow) (250K reads) used in this tutorial [metagenomics_tutorial](https://github.com/pjtorres/metagenomics_tutorial). 1. Before getting started make sure you have cloned the SnakeMAGs repository or you have downloaded all the necessary files (SnakeMAGs.smk, config.yaml, chr19.fa.gz, insub732_2_R1.fastq.gz, insub732_2_R2.fastq.gz). See the [SnakeMAGs executable](#snakemags-executable) section. 2. Unzip the fastq files and the host sequences file. ``` gunzip fastqs/insub732_2_R1.fastq.gz fastqs/insub732_2_R2.fastq.gz host_genomes/chr19.fa.gz ``` 3. For better organisation put all the read files in the same directory (eg. fastqs) and the host sequences file in a separate directory (eg. host_genomes) 4. Edit the config file (see [Edit config file](#edit-config-file) section) 5. Run the test (see [Run SnakeMAGs](#run-snakemags) section) Note: the analysis of these files took 1159.32 seconds to complete on an Ubuntu 22.04 LTS with an Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz x 40 processor, 96GB of RAM. # Genome reference for host reads filtering For host-associated samples, one can remove host sequences from the metagenomic reads by mapping these reads against a reference genome. In the case of termite gut metagenomes, we are providing [here](https://zenodo.org/record/6908287#.YuAdFXZBx8M) the relevant files (fasta and index files) from termite genomes. Upon request, we can help you to generate these files for your own reference genome and make them available to the community. NB. These steps of mapping generate voluminous files such as .bam and .sam. Depending on your disk space, you might want to delete these files after use. 
# Use case During the test phase of the development of SnakeMAGs, we used this workflow to process 10 publicly available termite gut metagenomes generated by Illumina sequencing, to ultimately reconstruct prokaryotic MAGs. These metagenomes were retrieved from the NCBI database using the following accession numbers: SRR10402454; SRR14739927; SRR8296321; SRR8296327; SRR8296329; SRR8296337; SRR8296343; DRR097505; SRR7466794; SRR7466795. They come from five different studies: Waidele et al, 2019; Tokuda et al, 2018; Romero Victorica et al, 2020; Moreira et al, 2021; and Calusinska et al, 2020. ## Download the Illumina pair-end reads We use fasterq-dump tool to extract data in FASTQ-format from SRA-accessions. It is a commandline-tool which offers a faster solution for downloading those large files. ``` # Install and activate sra-tools environment ## Note: For this study we used sra-tools 2.11.0 conda activate conda install -c bioconda sra-tools conda activate sra-tools # Download fastqs in a single directory mkdir raw_fastq cd raw_fastq fasterq-dump --threads --skip-technical --split-3 ``` ## Download Genome reference for host reads filtering ``` mkdir host_genomes cd host_genomes wget https://zenodo.org/record/6908287/files/termite_genomes.fasta.gz gunzip termite_genomes.fasta.gz ``` ## Edit the config file See [Edit config file](#edit-config-file) section. ## Run SnakeMAGs ``` conda activate snakemake_7.0.0 mkdir cluster_logs snakemake --snakefile SnakeMAGs.smk --cluster 'sbatch -p --mem -c -o "cluster_logs/{wildcards}.{rule}.{jobid}.out" -e "cluster_logs/{wildcards}.{rule}.{jobid}.err" ' --jobs --use-conda --conda-frontend conda --conda-prefix /path/to/SnakeMAGs_conda_env/ --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 180 --configfile /path/to/config.yaml --keep-going ``` ## Study results The MAGs reconstructed from each metagenome and their taxonomic classification are available in this [repository](https://doi.org/10.5281/zenodo.7661004). 
# Citations If you use SnakeMAGs, please cite: > Tadrent N, Dedeine F and Hervé V. SnakeMAGs: a simple, efficient, flexible and scalable workflow to reconstruct prokaryotic genomes from metagenomes [version 2; peer review: 2 approved]. F1000Research 2023, 11:1522 (https://doi.org/10.12688/f1000research.128091.2) Please also cite the dependencies: - [Snakemake](https://doi.org/10.12688/f1000research.29032.2) : Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., Nahnsen, S., & Köster, J. (2021) Sustainable data analysis with Snakemake [version 2; peer review: 2 approved]. *F1000Research* 2021, 10:33. - [illumina-utils](https://doi.org/10.1371/journal.pone.0066643) : Murat Eren, A., Vineis, J. H., Morrison, H. G., & Sogin, M. L. (2013). A Filtering Method to Generate High Quality Short Reads Using Illumina Paired-End Technology. *PloS ONE*, 8(6), e66643. - [Trimmomatic](https://doi.org/10.1093/bioinformatics/btu170) : Bolger, A. M., Lohse, M., & Usadel, B. (2014). Genome analysis Trimmomatic: a flexible trimmer for Illumina sequence data. *Bioinformatics*, 30(15), 2114-2120. - [Bowtie2](https://doi.org/10.1038/nmeth.1923) : Langmead, B., & Salzberg, S. L. (2012). Fast gapped-read alignment with Bowtie 2. *Nature Methods*, 9(4), 357–359. - [SAMtools](https://doi.org/10.1093/bioinformatics/btp352) : Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., Marth, G., Abecasis, G., & Durbin, R. (2009). The Sequence Alignment/Map format and SAMtools. *Bioinformatics*, 25(16), 2078–2079. - [BEDtools](https://doi.org/10.1093/bioinformatics/btq033) : Quinlan, A. R., & Hall, I. M. (2010). BEDTools: A flexible suite of utilities for comparing genomic features. *Bioinformatics*, 26(6), 841–842. - [MEGAHIT](https://doi.org/10.1093/bioinformatics/btv033) : Li, D., Liu, C. M., Luo, R., Sadakane, K., & Lam, T. W. (2015). 
MEGAHIT: An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph. *Bioinformatics*, 31(10), 1674–1676. - [bwa](https://doi.org/10.1093/bioinformatics/btp324) : Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. *Bioinformatics*, 25(14), 1754–1760. - [MetaBAT2](https://doi.org/10.7717/peerj.7359) : Kang, D. D., Li, F., Kirton, E., Thomas, A., Egan, R., An, H., & Wang, Z. (2019). MetaBAT 2: An adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies. *PeerJ*, 2019(7), 1–13. - [CheckM](https://doi.org/10.1101/gr.186072.114) : Parks, D. H., Imelfort, M., Skennerton, C. T., Hugenholtz, P., & Tyson, G. W. (2015). CheckM: Assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. *Genome Research*, 25(7), 1043–1055. - [GTDB-Tk](https://doi.org/10.1093/BIOINFORMATICS/BTAC672) : Chaumeil, P.-A., Mussig, A. J., Hugenholtz, P., Parks, D. H. (2022). GTDB-Tk v2: memory friendly classification with the genome taxonomy database. *Bioinformatics*. - [CoverM](https://github.com/wwood/CoverM) - [Waidele et al, 2019](https://doi.org/10.1101/526038) : Waidele, L., Korb, J., Voolstra, C. R., Dedeine, F., & Staubach, F. (2019). Ecological specificity of the metagenome in a set of lower termite species supports contribution of the microbiome to adaptation of the host. *Animal Microbiome*, 1(1), 1–13. - [Tokuda et al, 2018](https://doi.org/10.1073/pnas.1810550115) : Tokuda, G., Mikaelyan, A., Fukui, C., Matsuura, Y., Watanabe, H., Fujishima, M., & Brune, A. (2018). Fiber-associated spirochetes are major agents of hemicellulose degradation in the hindgut of wood-feeding higher termites. *Proceedings of the National Academy of Sciences of the United States of America*, 115(51), E11996–E12004. - [Romero Victorica et al, 2020](https://doi.org/10.1038/s41598-020-60850-5) : Romero Victorica, M., Soria, M. 
A., Batista-García, R. A., Ceja-Navarro, J. A., Vikram, S., Ortiz, M., Ontañon, O., Ghio, S., Martínez-Ávila, L., Quintero García, O. J., Etcheverry, C., Campos, E., Cowan, D., Arneodo, J., & Talia, P. M. (2020). Neotropical termite microbiomes as sources of novel plant cell wall degrading enzymes. *Scientific Reports*, 10(1), 1–14. - [Moreira et al, 2021](https://doi.org/10.3389/fevo.2021.632590) : Moreira, E. A., Persinoti, G. F., Menezes, L. R., Paixão, D. A. A., Alvarez, T. M., Cairo, J. P. L. F., Squina, F. M., Costa-Leonardo, A. M., Rodrigues, A., Sillam-Dussès, D., & Arab, A. (2021). Complementary contribution of Fungi and Bacteria to lignocellulose digestion in the food stored by a neotropical higher termite. *Frontiers in Ecology and Evolution*, 9(April), 1–12. - [Calusinska et al, 2020](https://doi.org/10.1038/s42003-020-1004-3) : Calusinska, M., Marynowska, M., Bertucci, M., Untereiner, B., Klimek, D., Goux, X., Sillam-Dussès, D., Gawron, P., Halder, R., Wilmes, P., Ferrer, P., Gerin, P., Roisin, Y., & Delfosse, P. (2020). Integrative omics analysis of the termite gut system adaptation to Miscanthus diet identifies lignocellulose degradation enzymes. *Communications Biology*, 3(1), 1–12. - [Orakov et al, 2021](https://doi.org/10.1186/s13059-021-02393-0) : Orakov, A., Fullam, A., Coelho, L. P., Khedkar, S., Szklarczyk, D., Mende, D. R., Schmidt, T. S. B., & Bork, P. (2021). GUNC: detection of chimerism and contamination in prokaryotic genomes. *Genome Biology*, 22(1). - [Parks et al, 2015](https://doi.org/10.1101/gr.186072.114) : Parks, D. H., Imelfort, M., Skennerton, C. T., Hugenholtz, P., & Tyson, G. W. (2015). CheckM: Assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. *Genome Research*, 25(7), 1043–1055. # License This project is licensed under the CeCILL License - see the [LICENSE](https://github.com/Nachida08/SnakeMAGs/blob/main/LICENCE) file for details. 
Developed by Nachida Tadrent at the Insect Biology Research Institute ([IRBI](https://irbi.univ-tours.fr/)), under the supervision of Franck Dedeine and Vincent Hervé. """ ; ns1:image ; ns1:keywords "Bioinformatics, Metagenomics, binning, MAG" ; ns1:license ; ns1:name "SnakeMAGs: a simple, efficient, flexible and scalable workflow to reconstruct prokaryotic genomes from metagenomes" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-11-01T11:50:02Z"^^ns1:Date ; ns1:dateModified "2024-11-01T11:50:02Z"^^ns1:Date ; ns1:description """# The Polygenic Score Catalog Calculator (`pgsc_calc`) [![Documentation Status](https://readthedocs.org/projects/pgsc-calc/badge/?version=latest)](https://pgsc-calc.readthedocs.io/en/latest/?badge=latest) [![pgscatalog/pgsc_calc CI](https://github.com/PGScatalog/pgsc_calc/actions/workflows/ci.yml/badge.svg)](https://github.com/PGScatalog/pgsc_calc/actions/workflows/ci.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5970794.svg)](https://doi.org/10.5281/zenodo.5970794) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-≥23.10.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) ## Introduction `pgsc_calc` is a bioinformatics best-practice analysis pipeline for calculating polygenic [risk] scores on samples with imputed genotypes using existing scoring files from the [Polygenic Score (PGS) 
Catalog](https://www.pgscatalog.org/) and/or user-defined PGS/PRS. ## Pipeline summary > [!IMPORTANT] > * Whole genome sequencing (WGS) data [are not currently supported by the calculator](https://pgsc-calc.readthedocs.io/en/latest/explanation/match.html#are-your-target-genomes-imputed-are-they-wgs) > * It’s possible to [create compatible gVCFs from WGS data](https://github.com/PGScatalog/pgsc_calc/discussions/123#discussioncomment-6469422). We plan to improve support for WGS data in the near future.

The workflow performs the following steps: * Downloading scoring files using the PGS Catalog API in a specified genome build (GRCh37 and GRCh38). * Reading custom scoring files (and performing a liftover if genotyping data is in a different build). * Automatically combines and creates scoring files for efficient parallel computation of multiple PGS - Matching variants in the scoring files against variants in the target dataset (in plink bfile/pfile or VCF format) * Calculates PGS for all samples (linear sum of weights and dosages) * Creates a summary report to visualize score distributions and pipeline metadata (variant matching QC) And optionally: - Genetic Ancestry: calculate similarity of target samples to populations in a reference dataset ([1000 Genomes (1000G)](http://www.nature.com/nature/journal/v526/n7571/full/nature15393.html)), using principal components analysis (PCA) - PGS Normalization: Using reference population data and/or PCA projections to report individual-level PGS predictions (e.g. percentiles, z-scores) that account for genetic ancestry See documentation for a list of planned [features under development](https://pgsc-calc.readthedocs.io/en/latest/index.html#Features-under-development). ### PGS applications and libraries `pgsc_calc` uses applications and libraries internally developed at the PGS Catalog, which can do helpful things like: * Query the PGS Catalog to bulk download scoring files in a specific genome build * Match variants from scoring files to target variants * Adjust calculated PGS in the context of genetic ancestry If you want to write Python code to work with PGS, [check out the `pygscatalog` repository to learn more](https://github.com/PGScatalog/pygscatalog). If you want a simpler way of working with PGS, ignore this section and continue below to learn more about `pgsc_calc`. ## Quick start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=23.10.0`) 2. 
Install [`Docker`](https://docs.docker.com/engine/installation/) or [`Singularity (v3.8.3 minimum)`](https://www.sylabs.io/guides/3.0/user-guide/) (please only use [`Conda`](https://conda.io/miniconda.html) as a last resort) 3. Download the pipeline and test it on a minimal dataset with a single command: ```console nextflow run pgscatalog/pgsc_calc -profile test, ``` 4. Start running your own analysis! ```console nextflow run pgscatalog/pgsc_calc -profile --input samplesheet.csv --pgs_id PGS001229 ``` See [getting started](https://pgsc-calc.readthedocs.io/en/latest/getting-started.html) for more details. ## Documentation [Full documentation is available on Read the Docs](https://pgsc-calc.readthedocs.io/) ## Credits pgscatalog/pgsc_calc is developed as part of the PGS Catalog project, a collaboration between the University of Cambridge’s Department of Public Health and Primary Care (Michael Inouye, Samuel Lambert) and the European Bioinformatics Institute (Helen Parkinson, Laura Harris). The pipeline seeks to provide a standardized workflow for PGS calculation and ancestry inference implemented in nextflow derived from an existing set of tools/scripts developed by Inouye lab (Rodrigo Canovas, Scott Ritchie, Jingqin Wu) and PGS Catalog teams (Samuel Lambert, Laurent Gil). The adaptation of the codebase, nextflow implementation, and PGS Catalog features are written by Benjamin Wingfield, Samuel Lambert, Laurent Gil with additional input from Aoife McMahon (EBI). Development of new features, testing, and code review is ongoing including Inouye lab members (Rodrigo Canovas, Scott Ritchie) and others. If you use the tool we ask you to cite our paper describing software and updated PGS Catalog resource: - >Lambert, Wingfield _et al._ (2024) Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization. Nature Genetics. doi:[10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x). 
This pipeline is distributed under an [Apache License](LICENSE) and uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community (Ewels *et al. Nature Biotech* (2020) doi:[10.1038/s41587-020-0439-x](https://doi.org/10.1038/s41587-020-0439-x)), reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). Additional references of open-source tools and data used in this pipeline are described in [`CITATIONS.md`](CITATIONS.md). This work has received funding from EMBL-EBI core funds, the Baker Institute, the University of Cambridge, Health Data Research UK (HDRUK), and the European Union’s Horizon 2020 research and innovation programme under grant agreement No 101016775 INTERVENE. """ ; ns1:isBasedOn ; ns1:keywords "Nextflow, Workflows, polygenic risk score, polygenic score, prediction, GWAS, genomic ancestry" ; ns1:license ; ns1:name "The Polygenic Score Catalog Calculator" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble.pcz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble_rmsd.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble_rmsd.dcd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bd_ensemble_uncompressed.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bfactor_all.dat" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bfactor_all.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/dmd_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/dmd_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/hinges_bfactor_report.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mypdb.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/nma_ensemble.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/nma_ensemble.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_collectivity.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_evecs.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_proj1.crd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_proj1.dcd" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_report.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/pcz_stiffness.json" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/structure_ca.pdb" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-08-11T08:32:23Z"^^ns1:Date ; ns1:dateModified "2023-08-11T08:34:07Z"^^ns1:Date ; ns1:description """# Macromolecular Coarse-Grained Flexibility (FlexServ) tutorial using BioExcel Building Blocks (biobb) This tutorial aims to illustrate the process of generating protein conformational ensembles from 3D structures and analysing its molecular flexibility, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.557.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Macromolecular Coarse-Grained Flexibility tutorial" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mygodmd_prep.aln" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin-target.godmd.dcd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin-target.godmd.ene.out" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin-target.godmd.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin-target.godmd.mdcrd" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin-target.godmd.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin.chains.nolig.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin.chains.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/origin.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/target.chains.pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/target.pdb" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-08-11T08:36:46Z"^^ns1:Date ; ns1:dateModified "2023-08-11T08:38:31Z"^^ns1:Date ; ns1:description """# Protein Conformational Transitions calculations tutorial using BioExcel Building Blocks (biobb) and GOdMD This tutorial aims to illustrate the process of computing a conformational transition between two known structural conformations of a protein, step by step, using the BioExcel Building Blocks library (biobb). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2023 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2023 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.558.1" ; ns1:keywords "" ; ns1:license ; ns1:name "Galaxy Protein Conformational Transitions calculations tutorial" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "RMD" ; ns1:name "R markdown" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2023-08-30T06:57:48Z"^^ns1:Date ; ns1:dateModified "2023-08-30T07:06:42Z"^^ns1:Date ; ns1:description """We present an R script that describes the workflow for analysing honey bee (_Apis mellifera_) wing shape. It is based on a dataset of wing images and landmark coordinates available at Zenodo: https://doi.org/10.5281/zenodo.8128010. The dataset can be used as a reference for the identification of local bees from southern Kazakhstan, which most probably belong to the subspecies _Apis mellifera pomonella_. It was compared with data from Nawrocka et al. (2018), available at Zenodo: https://doi.org/10.5281/zenodo.7567336. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.559.1" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "Apis-mellifera-wings-KZ: A workflow for morphometric identification of honey bees from Kazakhstan" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T15:04:20Z"^^ns1:Date ; ns1:dateModified "2026-03-24T15:08:48Z"^^ns1:Date ; ns1:description """# Mutation Free Energy Calculations using BioExcel Building Blocks (biobb) *** **Based on the official [pmx tutorial](http://pmx.mpibpc.mpg.de/sardinia2018_tutorial1/index.html).** *** This tutorial aims to illustrate how to compute a **fast-growth mutation free energy** calculation, step by step, using the BioExcel **Building Blocks library (biobb)**. The particular example used is the **Staphylococcal nuclease** protein (PDB code 1STN), a small, minimal protein, appropriate for a short tutorial. The **non-equilibrium free energy calculation** protocol performs a **fast alchemical transition** in the direction **WT->Mut** and back **Mut->WT**. The two equilibrium trajectories needed for the tutorial, one for **Wild Type (WT)** and another for the **Mutated (Mut)** protein (Isoleucine 10 to Alanine -I10A-), have already been generated and are included in this example. We will name **WT as stateA** and **Mut as stateB**. ![](https://raw.githubusercontent.com/bioexcel/biobb_wf_pmx_tutorial/master/biobb_wf_pmx_tutorial/notebooks/schema.png) The tutorial calculates the **free energy difference** in the folded state of a protein. Starting from **two 1ns-length independent equilibrium simulations** (WT and mutant), snapshots are selected to start **fast (50ps) transitions** driving the system in the **forward** (WT to mutant) and **reverse** (mutant to WT) directions, and the **work values** required to perform these transitions are collected. With these values, **Crooks Gaussian Intersection** (CGI), **Bennett Acceptance Ratio** (BAR) and **Jarzynski estimator** methods are used to calculate the **free energy difference** between the two states. 
*Please note that for the sake of disk space this tutorial is using 1ns-length equilibrium trajectories, whereas in the [original example](http://pmx.mpibpc.mpg.de/sardinia2018_tutorial1/eq.mdp) the equilibrium trajectories used were obtained from 10ns-length simulations.* *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). * (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.55.8" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Mutation Free Energy Calculations" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 8 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Jupyter" ; ns1:url . a ns1:Person ; ns1:name "Kenneth Chan" . a ns1:Person ; ns1:name "Martha Zakrzewski" . a ns1:Person ; ns1:name "Naga Kasinadhuni" . a ns1:Person ; ns1:name "Uwe Winter" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , ; ns1:dateCreated "2023-08-31T07:41:03Z"^^ns1:Date ; ns1:dateModified "2024-02-05T03:12:14Z"^^ns1:Date ; ns1:description """# HiFi *de novo* genome assembly workflow HiFi-assembly-workflow is a bioinformatics pipeline that can be used to analyse Pacbio CCS reads for *de novo* genome assembly using PacBio Circular Consensus Sequencing (CCS) reads. This workflow is implemented in Nextflow and has 3 major sections. Please refer to the following documentation for detailed description of each workflow section: - [Adapter filtration and pre-assembly quality control (QC)](https://australianbiocommons.github.io/hifi-assembly-workflow/recommendations#stage-1-adapter-filtration-and-pre-assembly-quality-control) - [Assembly](https://australianbiocommons.github.io/hifi-assembly-workflow/recommendations#stage-2-assembly) - [Post-assembly QC](https://australianbiocommons.github.io/hifi-assembly-workflow/recommendations#stage-3-post-assembly-quality-control) ## General recommendations A more detailed module and workflow description as well as execution examples on Gadi and Setonix are [available here](https://australianbiocommons.github.io/hifi-assembly-workflow/workflows). ## Attributions This work was developed at AGRF and supported by the Australian BioCommons via Bioplatforms Australia funding, the Australian Research Data Commons (https://doi.org/10.47486/PL105) and the Queensland Government RICF programme. Bioplatforms Australia and the Australian Research Data Commons are enabled by the National Collaborative Research Infrastructure Strategy (NCRIS). The documentation in this repository is based on Australian BioCommons guidelines. """ ; ns1:keywords "" ; ns1:license ; ns1:name "HiFi de novo genome assembly workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Minimum number of overlap" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bin_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/effective_genome_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/n rmDup BAMSR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/average_bigwig" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/individual_macs2_narrowPeaks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/merged_macs2_narrowPeaks" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/multiqc_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/shared_narrowPeak" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-04-11T02:02:26Z"^^ns1:Date ; ns1:dateModified "2026-04-11T02:02:26Z"^^ns1:Date ; ns1:description "Identifies high-confidence consensus peaks from ChIP-seq single-end replicate experiments. The workflow calls peaks on individual replicates and identifies their intersection. To control for sequencing depth differences, it subsamples all replicates to the smallest library size, performs peak calling on the combined normalized data, and retains only peaks whose summits overlap with intersections from a user-defined minimum number of replicates." ; ns1:input , , , ; ns1:isBasedOn ; ns1:keywords "ChIP" ; ns1:license ; ns1:name "consensus-peaks/consensus-peaks-chip-sr" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 13 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly to be polished" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "long reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minimap setting (for long reads) " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly polished by long reads using Racon" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-08T02:01:44Z"^^ns1:Date ; ns1:dateModified "2026-04-20T01:01:58Z"^^ns1:Date ; ns1:description "Racon polish with long reads, x4" ; ns1:input , , ; ns1:keywords "" ; ns1:license ; ns1:name "polish-with-long-reads/main" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Fernando Cruz (CNAG)" . a ns1:Person ; ns1:name "Francisco Camara (CNAG)" . a ns1:Person ; ns1:name "Tyler Alioto (CNAG)" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2024-02-02T12:24:07Z"^^ns1:Date ; ns1:dateModified "2024-02-02T12:24:51Z"^^ns1:Date ; ns1:description """# CLAWS (CNAG's Long-read Assembly Workflow in Snakemake) Snakemake Pipeline used for de novo genome assembly @CNAG. It has been developed for Snakemake v6.0.5. It accepts Oxford Nanopore Technologies (ONT) reads, PacBio HiFi reads, illumina paired-end data, illumina 10X data and Hi-C reads. It does the preprocessing of the reads, assembly, polishing, purge_dups, scaffolding and different evaluation steps. By default it will preprocess the reads, run Flye + Hypo + purge_dups + yahs and evaluate the resulting assemblies with BUSCO, MERQURY, Nseries and assembly_stats. It needs a config file and a spec file (json file with instructions on which resources should slurm use for each of the jobs). Both files are created by the script "create_config_assembly.py" that is located in the bin directory. 
To check all the options accepted by the script, do: ``` bin/create_config_assembly.py -h ``` Once the 2 config files are produced, the pipeline can be launched using snakemake like this: ``snakemake --notemp -j 999 --snakefile assembly_pipeline.smk --configfile assembly.config --is --cluster-conf assembly.spec --use-conda --use-envmodules`` If you are using an HPC cluster, please check how should you run snakemake to launch the jobs to the cluster. Most of the tools used will be installed via conda using the environments of the "envs" directory after providing the "--use-conda" option to snakemake. However, a few tools cannot be installed via conda and will have to be available in your PATH, or as a module in the cluster. Those tools are: - NextDenovo/2.5.0 - NextPolish/1.4.1 # How to provide input data: There are several ways of providing the reads. ### 1- ONT reads 1.1 Using the option ``--ont-dir {DIR}`` in create_config_assembly.py. If you do so, it will look for all the files in the directory that end in '.fastq.gz' and will add the basenames to "ONT_wildcards". These wildcards will be processed by the pipeline that will: - Concatenate all the files into a single file - Run filtlong with the default or specified parameters. - Use the resulting file for assembly, polishing and/or purging. You can also specify the basenames of the files that you want to use with the ``--ont-list `` option. In this case, the pipeline will use the wildcards that you're providing instead of merging all the files in the directory. 1.2 Using the option ```--ont-reads {FILE}``` in create_config_assembly.py. If you do so, it will consider that you already have all the reads in one file and will: - Run filtlong with the default or specified parameters. - Use the resulting file for assembly, polishing and/or purging. 1.3 Using the option ```--ont-filt {FILE}```. It will use this file as the output from filtlong. 
Hence, it will skip the preprocessing steps and directly use it for assembly, polishing and/or purging. ### 2-Illumina 10X-linked data 2.1 Using the ```--raw-10X {DIR:list}``` option. Dictionary with 10X raw read directories, it has to be the mkfastq dir. You must specify as well the sampleIDs from this run. Example: '{"mkfastq- dir":"sample1,sample2,sample3"}'... It will take each basename in the list to get the fastqs from the corresponding directory and run longranger on each sample. Afterwards, it will build meryldbs for each "barcoded" file. Finally, it will concatenate all the meryldbs and "barcoded" files. Resulting "barcoded" file will be used for polishing. 2.2 Using the ``--processed-10X {DIR}`` parameter. This directory can already be there or be produced by the pipeline as described in step 2.1. Once all the "barcoded" fastq files are there, meryldbs will be built for each "barcoded" file. Finally, it will concatenate all the meryldbs and "barcoded" files. Resulting "barcoded" file will be used for polishing. 2.3 Using the ``--10X`` option. The argument to this is the path to the concatenated ".barcoded" file that needs to be used for polishing. If the pre-concatenated files are not given, meryldbs will be directly generated with this file, but it may run out of memory. ### 3- Illumina short-read data 3.1 Using the ``--illumina-dir {DIR}`` option, that will look for all the files in the directory that end in '.1.fastq.gz' and will add the basenames to "illumina_wildcards". These wildcards will be processed by the pipeline that will: - Trim adaptors with Trimgalore - Concatenate all the trimmed *.1.fastq.gz and the *2.fastq.gz in one file per pair. - The resulting reads will be used for building meryldbs and polishing. 3.2 Using the ``--processed-illumina`` option. If the directory exists and contains files, the pipeline will look for all the files in the directory that end in '.1.fastq.gz' and will add the basenames to "illumina_wildcards". 
These wildcards will be processed by the pipeline that will: - Concatenate all the trimmed *.1.fastq.gz and the *2.fastq.gz in one file per pair. - The resulting reads will be used for building meryldbs and polishing. 3.3 Using the ``--pe1 {FILE} and --pe2 {FILE}`` options. That will consider that these are the paired files containing all the illumina reads ready to be used and will build meryldbs and polish with them. ### 4- Input assemblies If you want to polish an already assembled assembly, you can give it to the pipeline by using the option ``--assembly-in ASSEMBLY_IN [ASSEMBLY_IN ...] Dictionary with assemblies that need to be polished but not assembled and directory where they should be polished. Example: '{"assembly1":"polishing_dir1"}' '{"assembly2"="polishing_dir2"}' ...`` If you want to start the pipeline after polishing on an already existing assembly, you can give it to the pipeline by using the option ``--postpolish-assemblies POSTPOLISH_ASSEMBLIES [POSTPOLISH_ASSEMBLIES ...] Dictionary with assemblies for which postpolishing steps need to be run but that are not assembled and base step for the directory where the first postpolishing step should be run. Example: '{"assembly1":"s04.1_p03.1"}' '{"assembly2"="s04.2_p03.2"}' ...`` To evaluate and produce the final pretext file on a curated assembly, use ``--curated-assemblies CURATED_ASSEMBLIES [CURATED_ASSEMBLIES ...] Dictionary with assemblies that have already been curated. Evaluations and read alignment will be performed. 
Example: '{"assembly1":"s04.1_p03.1"}' '{"assembly2":"s04.2_p03.2"}' ...`` # Description of implemented rules 1- Preprocessing: - **Read concatenation:** ``zcat {input.fastqs} | pigz -p {threads} -c > {output.final_fastq}`` - **Longranger for 10X reads**: it uses the Longranger version installed in the path specified in the configfile ``longranger basic --id={params.sample} --sample={params.sample} --fastqs={input.mkfastq_dir} --localcores={threads}`` - **Trimgalore:** By default it gives the ``--max_n 0 --gzip -q 20 --paired --retain_unpaired`` options, but it can be changed with the ``--trim-galore-opts `` argument. ``trim_galore -j {threads} {params.opts} {input.read1} {input.read2}`` - **Filtlong:** it uses the Filtlong version installed in the path specified in the configfile. By default it gives the min_length and min_mean_q parameters, but extra parameters can be added with the ``--filtlong-opts`` option. ``filtlong --min_length {params.minlen} --min_mean_q {params.min_mean_q} {params.opts} {input.reads} | pigz -p {threads} -c > {output.outreads}`` - **Build meryldb**: it uses the merqury conda environment specified in the configfile. It takes as argument the `--meryl-k` value that needs to be estimated first for the genome size. It can run either on the illumina reads, the ont reads or both, default behaviour is both. ``meryl k={params.kmer} count output {output.out_dir} {input.fastq}`` - Concat meryldbs: with the merqury conda environment specified in the configfile ``meryl union-sum output {output.meryl_all} {input.input_run}`` - **Align ONT (Minimap2):** it aligns the reads using minimap2 and outputs the alignment either in bam or in paf.gz formats. 
It uses the minimap2 conda environment specified in the configfile ``minimap2 -{params.align_opts} -t {threads} {input.genome} {input.reads} `` - **Align Illumina (BWA-MEM):** it aligns the reads with BWA-mem and outputs a bam file ``bwa mem -Y {params.options} -t {threads} {input.genome} {input.reads} | samtools view -Sb - | samtools sort -@ {threads} -o {output.mapping} -`` 2- Assembly - **Flye (default)**. It is run by default, if you don't want the pipeline to run it, you can give `--no-flye` option when creating the config. It uses the conda environment specified in the config. By default it is set to 2 polishing iterations and gives the genome-size estimate that has been given when creating the config. Extra options can be provided with the `--flye-opts`. ``flye --{params.readtype} {input.reads} -o {params.outdir}out -t {threads} -i {params.pol_iterations} {params.other_flye_opts} `` - **Nextdenovo (if ``run-nextdenovo``):** It uses the cluster module specified in the config. If nextdenovo option is turned on, the create_config script will also create the nextdenovo config file. Check the create_config help to see which options can be modified on it. ``nextDenovo {input.config}`` 3- Polishing - **Hypo (default):** It is the polisher that the pipeline uses by default, it can be turned off specifying ``--no-hypo`` when creating the config. If selected, the reads will be aligned in previous rules and then hypo will be run, it requires illumina data. It uses the conda environment specified in the config. ``hypo -r @short_reads.list.txt -d {input.genome} -b {input.sr_bam} -c {coverage} -s {params.genome_size} -B {input.lr_bam} -t {threads} -o {output.polished} -p {params.proc} {params.opts} `` - **Nextpolish ont (if turned on):** to run nextpolish with ONT reads, specify ``--nextpolish-ont-rounds`` and the number of rounds you want to run of it. 
``"python /apps/NEXTPOLISH/1.3.1/lib/nextpolish2.py -g {input.genome} -p {threads} -l lgs.fofn -r {params.lrtype} > {output.polished}`` - **Nextpolish illumina (if turned on):** to run nextpolish with ONT reads, specify ``--nextpolish-ill-rounds`` and the number of rounds you want to run of it. ``"python /apps/NEXTPOLISH/1.3.1/lib/nextpolish1.py -g {input.genome} -p {threads} -s {input.bam} -t {params.task} > {output.polished}`` 4- Post-assembly - **Purge_dups (by default):** select ``--no-purgedups`` if you don't want to run it. If no manual cutoffs are given, it'll run purgedups with automatic cutoffs and then will rerun it selecting the mean cutoff as 0.75\\*cov. It uses the version installed in the cluster module specified in the config. 5- Evaluations - **Merqury:** It runs on each 'terminal' assembly. This is, the base assembly and the resulting assembly from each branch of the pipeline. - **Busco:** It can be run only in the terminal assemblies or on all the assemblies produced by the pipeline. It uses the conda environment specified in the config as well as the parameters specified. - **Nseries:** This is run during the *finalize* on all the assemblies that are evaluated. After it, that rule combines the statistics produced by all the evaluation rules. 
# Description of all options ``` bin/create_config_assembly.py -h usage: create_configuration_file [-h] [--configFile configFile] [--specFile specFile] [--ndconfFile ndconfFile] [--concat-cores concat_cores] [--genome-size genome_size] [--lr-type lr_type] [--basename base_name] [--species species] [--keep-intermediate] [--preprocess-lr-step PREPROCESS_ONT_STEP] [--preprocess-10X-step PREPROCESS_10X_STEP] [--preprocess-illumina-step PREPROCESS_ILLUMINA_STEP] [--preprocess-hic-step PREPROCESS_HIC_STEP] [--flye-step FLYE_STEP] [--no-flye] [--nextdenovo-step NEXTDENOVO_STEP] [--run-nextdenovo] [--nextpolish-cores nextpolish_cores] [--minimap2-cores minimap2_cores] [--bwa-cores bwa_cores] [--hypo-cores hypo_cores] [--pairtools-cores pairtools_cores] [--busco-cores busco_cores] [--nextpolish-ont-rounds nextpolish_ont_rounds] [--nextpolish-ill-rounds nextpolish_ill_rounds] [--hypo-rounds hypo_rounds] [--longranger-cores longranger_cores] [--longranger-path longranger_path] [--genomescope-opts genomescope_additional] [--no-purgedups] [--ploidy ploidy] [--run-tigmint] [--run-kraken2] [--no-yahs] [--scripts-dir SCRIPTS_DIR] [--ont-reads ONT_READS] [--ont-dir ONT_DIR] [--ont-filt ONT_FILTERED] [--pe1 PE1] [--pe2 PE2] [--processed-illumina PROCESSED_ILLUMINA] [--raw-10X RAW_10X [RAW_10X ...]] [--processed-10X PROCESSED_10X] [--10X R10X] [--illumina-dir ILLUMINA_DIR] [--assembly-in ASSEMBLY_IN [ASSEMBLY_IN ...]] [--postpolish-assemblies POSTPOLISH_ASSEMBLIES [POSTPOLISH_ASSEMBLIES ...]] [--curated-assemblies CURATED_ASSEMBLIES [CURATED_ASSEMBLIES ...]] [--hic-dir HIC_DIR] [--pipeline-workdir PIPELINE_WORKDIR] [--filtlong-dir FILTLONG_DIR] [--concat-hic-dir CONCAT_HIC_DIR] [--flye-dir FLYE_DIR] [--nextdenovo-dir NEXTDENOVO_DIR] [--flye-polishing-dir POLISH_FLYE_DIR] [--nextdenovo-polishing-dir POLISH_NEXTDENOVO_DIR] [--eval-dir eval_dir] [--stats-out stats_out] [--hic-qc-dir hic_qc_dir] [--filtlong-minlen filtlong_minlen] [--filtlong-min-mean-q filtlong_min_mean_q] 
[--filtlong-opts filtlong_opts] [--kraken2-db kraken2_db] [--kraken2-kmer kraken2_kmers] [--kraken2-opts additional_kraken2_opts] [--kraken2-cores kraken2_threads] [--trim-galore-opts trim_galore_opts] [--trim-Illumina-cores Trim_Illumina_cores] [--flye-cores flye_cores] [--flye-polishing-iterations flye_pol_it] [--other-flye-opts other_flye_opts] [--nextdenovo-cores nextdenovo_cores] [--nextdenovo-jobtype nextdenovo_type] [--nextdenovo-task nextdenovo_task] [--nextdenovo-rewrite nextdenovo_rewrite] [--nextdenovo-parallel_jobs nextdenovo_parallel_jobs] [--nextdenovo-minreadlen nextdenovo_minreadlen] [--nextdenovo-seeddepth nextdenovo_seeddepth] [--nextdenovo-seedcutoff nextdenovo_seedcutoff] [--nextdenovo-blocksize nextdenovo_blocksize] [--nextdenovo-pa-correction nextdenovo_pa_correction] [--nextdenovo-minimap_raw nextdenovo_minimap_raw] [--nextdenovo-minimap_cns nextdenovo_minimap_cns] [--nextdenovo-minimap_map nextdenovo_minimap_map] [--nextdenovo-sort nextdenovo_sort] [--nextdenovo-correction_opts nextdenovo_correction_opts] [--nextdenovo-nextgraph_opt nextdenovo_nextgraph_opt] [--sr-cov ill_cov] [--hypo-proc hypo_processes] [--hypo-no-lr] [--hypo-opts hypo_opts] [--purgedups-cores purgedups_cores] [--purgedups-calcuts-opts calcuts_opts] [--tigmint-cores tigmint_cores] [--tigmint-opts tigmint_opts] [--hic-qc] [--no-pretext] [--assembly-qc assembly_qc] [--yahs-cores yahs_cores] [--yahs-mq yahs_mq] [--yahs-opts yahs_opts] [--hic-map-opts hic_map_opts] [--mq mq [mq ...]] [--hic-qc-assemblylen hic_qc_assemblylen] [--blast-cores blast_cores] [--hic-blastdb blastdb] [--hic-readsblast hic_readsblast] [--no-final-evals] [--busco-lin busco_lineage] [--merqury-db merqury_db] [--merqury-plot-opts merqury_plot_opts] [--meryl-k meryl_k] [--meryl-threads meryl_threads] [--meryl-reads meryl_reads [meryl_reads ...]] [--ont-list ONT_wildcards] [--illumina-list illumina_wildcards] [--r10X-list r10X_wildcards] [--hic-list hic_wildcards] Create a configuration json file for the 
assembly pipeline. options: -h, --help show this help message and exit General Parameters: --configFile configFile Configuration JSON to be generated. Default assembly.config --specFile specFile Cluster specifications JSON file to be generated. Default assembly.spec --ndconfFile ndconfFile Name of the nextdenovo config file. Default nextdenovo.config --concat-cores concat_cores Number of threads to concatenate reads and to run filtlong. Default 4 --genome-size genome_size Approximate genome size. Example: 615m or 2.6g. Default None --lr-type lr_type Type of long reads (options are flye read-type options). Default nano-hq --basename base_name Base name for the project. Default None --species species Name of the species to be assembled. Default None --keep-intermediate Set this to True if you do not want intermediate files to be removed. Default False --preprocess-lr-step PREPROCESS_ONT_STEP Step for preprocessing long-reads. Default 02.1 --preprocess-10X-step PREPROCESS_10X_STEP Step for preprocessing 10X reads. Default 02.2 --preprocess-illumina-step PREPROCESS_ILLUMINA_STEP Step for preprocessing illumina reads. Default 02.2 --preprocess-hic-step PREPROCESS_HIC_STEP Step for preprocessing hic reads. Default 02.3 --flye-step FLYE_STEP Step for running flye. Default 03.1 --no-flye Give this option if you do not want to run Flye. --nextdenovo-step NEXTDENOVO_STEP Step for running nextdenovo. Default 03.2 --run-nextdenovo Give this option if you do want to run Nextdenovo. --nextpolish-cores nextpolish_cores Number of threads to run the nextpolish step. Default 24 --minimap2-cores minimap2_cores Number of threads to run the alignment with minimap2. Default 32 --bwa-cores bwa_cores Number of threads to run the alignments with BWA-Mem2. Default 16 --hypo-cores hypo_cores Number of threads to run the hypo step. Default 24 --pairtools-cores pairtools_cores Number of threads to run the pairtools step. Default 100 --busco-cores busco_cores Number of threads to run BUSCO. 
Default 32 --nextpolish-ont-rounds nextpolish_ont_rounds Number of rounds to run the Nextpolish with ONT step. Default 0 --nextpolish-ill-rounds nextpolish_ill_rounds Number of rounds to run the Nextpolish with illumina step. Default 0 --hypo-rounds hypo_rounds Number of rounds to run the Hypo step. Default 1 --longranger-cores longranger_cores Number of threads to run longranger. Default 16 --longranger-path longranger_path Path to longranger executable. Default /scratch/project/devel/aateam/src/10X/longranger-2.2.2 --genomescope-opts genomescope_additional Additional options to run Genomescope2 with. Default -m 10000 --no-purgedups Give this option if you do not want to run Purgedups. --ploidy ploidy Expected ploidy. Default 2 --run-tigmint Give this option if you want to run the scaffolding with 10X reads step. --run-kraken2 Give this option if you want to run Kraken2 on the input reads. --no-yahs Give this option if you do not want to run yahs. Inputs: --scripts-dir SCRIPTS_DIR Directory with the different scripts for the pipeline. Default /software/assembly/pipelines/Assembly_pipeline/CLAWSv2.2/bin/../scripts/ --ont-reads ONT_READS File with all the ONT reads. Default None --ont-dir ONT_DIR Directory where the ONT fastqs are stored. Default None --ont-filt ONT_FILTERED File with the ONT reads after running filtlong on them. Default None --pe1 PE1 File with the illumina paired-end fastqs, already trimmed, pair 1. --pe2 PE2 File with the illumina paired-end fastqs, already trimmed, pair 2. --processed-illumina PROCESSED_ILLUMINA Directory to Processed illumina reads. Already there or to be produced by the pipeline. --raw-10X RAW_10X [RAW_10X ...] Dictionary with 10X raw read directories, it has to be the mkfastq dir. You must specify as well the sampleIDs from this run. Example: '{"mkfastq-dir":"sample1,sample2,sample3"}'... --processed-10X PROCESSED_10X Directory to Processed 10X reads. Already there or to be produced by the pipeline. 
--10X R10X File with barcoded 10X reads in fastq.gz format, concatenated. --illumina-dir ILLUMINA_DIR Directory where the raw illumina fastqs are stored. Default None --assembly-in ASSEMBLY_IN [ASSEMBLY_IN ...] Dictionary with assemblies that need to be polished but not assembled and directory where they should be polished. Example: '{"assembly1":"polishing_dir1"}' '{"assembly2"="polishing_dir2"}' ... --postpolish-assemblies POSTPOLISH_ASSEMBLIES [POSTPOLISH_ASSEMBLIES ...] Dictionary with assemblies for which postpolishing steps need to be run but that are not assembled and base step for the directory where the first postpolishing step should be run. Example: '{"assembly1":"s04.1_p03.1"}' '{"assembly2":"s04.2_p03.2"}' ... --curated-assemblies CURATED_ASSEMBLIES [CURATED_ASSEMBLIES ...] Dictionary with assemblies that have already been curated. Evaluations and read alignment will be performed. Example: '{"assembly1":"s04.1_p03.1"}' '{"assembly2":"s04.2_p03.2"}' ... --hic-dir HIC_DIR Directory where the HiC fastqs are stored. Default None Outputs: --pipeline-workdir PIPELINE_WORKDIR Base directory for the pipeline run. Default /scratch_isilon/groups/assembly/jgomez/test_CLAWSv2/ilErePala/assembly/ --filtlong-dir FILTLONG_DIR Directory to process the ONT reads with filtlong. Default s02.1_p01.1_Filtlong --concat-hic-dir CONCAT_HIC_DIR Directory to concatenate the HiC reads. Default s02.3_p01.1_Concat_HiC --flye-dir FLYE_DIR Directory to run flye. Default s03.1_p02.1_flye/ --nextdenovo-dir NEXTDENOVO_DIR Directory to run nextdenovo. Default s03.2_p02.1_nextdenovo/ --flye-polishing-dir POLISH_FLYE_DIR Directory to polish the flye assembly. Default s04.1_p03.1_polishing/ --nextdenovo-polishing-dir POLISH_NEXTDENOVO_DIR Directory to run nextdenovo. Default s04.2_p03.2_polishing/ --eval-dir eval_dir Base directory for the evaluations. Default evaluations/ --stats-out stats_out Path to the file with the final statistics. --hic-qc-dir hic_qc_dir Directory to run the hic_qc. 
Default hic_qc/ Filtlong: --filtlong-minlen filtlong_minlen Minimum read length to use with Filtlong. Default 1000 --filtlong-min-mean-q filtlong_min_mean_q Minimum mean quality to use with Filtlong. Default 80 --filtlong-opts filtlong_opts Extra options to run Filtlong (eg. -t 4000000000) Kraken2: --kraken2-db kraken2_db Database to be used for running Kraken2. Default None --kraken2-kmer kraken2_kmers Database to be used for running Kraken2. Default None --kraken2-opts additional_kraken2_opts Optional parameters for the rule Kraken2. Default --kraken2-cores kraken2_threads Number of threads to run the Kraken2 step. Default 16 Trim_Galore: --trim-galore-opts trim_galore_opts Optional parameters for the rule trim_galore. Default --max_n 0 --gzip -q 20 --paired --retain_unpaired --trim-Illumina-cores Trim_Illumina_cores Number of threads to run the Illumina trimming step. Default 8 Flye: --flye-cores flye_cores Number of threads to run FLYE. Default 128 --flye-polishing-iterations flye_pol_it Number of polishing iterations to use with FLYE. Default 2 --other-flye-opts other_flye_opts Additional options to run Flye. Default --scaffold Nextdenovo: --nextdenovo-cores nextdenovo_cores Number of threads to run nextdenovo. Default 2 --nextdenovo-jobtype nextdenovo_type Job_type for nextdenovo. Default slurm --nextdenovo-task nextdenovo_task Task need to run. Default all --nextdenovo-rewrite nextdenovo_rewrite Overwrite existing directory. Default yes --nextdenovo-parallel_jobs nextdenovo_parallel_jobs Number of tasks used to run in parallel. Default 50 --nextdenovo-minreadlen nextdenovo_minreadlen Filter reads with length < minreadlen. Default 1k --nextdenovo-seeddepth nextdenovo_seeddepth Expected seed depth, used to calculate seed_cutoff, co-use with genome_size, you can try to set it 30-45 to get a better assembly result. Default 45 --nextdenovo-seedcutoff nextdenovo_seedcutoff Minimum seed length, <=0 means calculate it automatically using bin/seq_stat. 
Default 0 --nextdenovo-blocksize nextdenovo_blocksize Block size for parallel running, split non-seed reads into small files, the maximum size of each file is blocksize. Default 1g --nextdenovo-pa-correction nextdenovo_pa_correction number of corrected tasks used to run in parallel, each corrected task requires ~TOTAL_INPUT_BASES/4 bytes of memory usage, overwrite parallel_jobs only for this step. Default 100 --nextdenovo-minimap_raw nextdenovo_minimap_raw minimap2 options, used to find overlaps between raw reads, see minimap2-nd for details. Default -t 30 --nextdenovo-minimap_cns nextdenovo_minimap_cns minimap2 options, used to find overlaps between corrected reads. Default -t 30 --nextdenovo-minimap_map nextdenovo_minimap_map minimap2 options, used to map reads back to the assembly. Default -t 30 --no-kalloc --nextdenovo-sort nextdenovo_sort sort options, see ovl_sort for details. Default -m 400g -t 20 --nextdenovo-correction_opts nextdenovo_correction_opts Correction options. Default -p 30 -dbuf --nextdenovo-nextgraph_opt nextdenovo_nextgraph_opt nextgraph options, see nextgraph for details. Default -a 1 Hypo: --sr-cov ill_cov Approximate short read coverage for hypo Default 0 --hypo-proc hypo_processes Number of contigs to be processed in parallel by HyPo. Default 6 --hypo-no-lr Set this to false if you don't want to run hypo with long reads. Default True --hypo-opts hypo_opts Additional options to run Hypo. Default None Purge_dups: --purgedups-cores purgedups_cores Number of threads to run purgedups. Default 8 --purgedups-calcuts-opts calcuts_opts Adjusted values to run calcuts for purgedups. Default None Scaffold_with_10X: --tigmint-cores tigmint_cores Number of threads to run the 10X scaffolding step. Default 12 --tigmint-opts tigmint_opts Adjusted values to run the scaffolding with 10X reads. Default None HiC: --hic-qc Give this option if only QC of the HiC data needs to be done. 
--no-pretext Give this option if you do not want to generate the pretext file --assembly-qc assembly_qc Path to the assembly to be used to perform the QC of the HiC reads. --yahs-cores yahs_cores Number of threads to run YAHS. Default 48 --yahs-mq yahs_mq Mapping quality to use when running yahs.Default 40 --yahs-opts yahs_opts Additional options to give to YAHS.Default --hic-map-opts hic_map_opts Options to use with bwa mem when aligning the HiC reads. Default -5SP -T0 --mq mq [mq ...] Mapping qualities to use for processing the hic mappings. Default [0, 40] --hic-qc-assemblylen hic_qc_assemblylen Length of the assembly to be used for HiC QC --blast-cores blast_cores Number of threads to run blast with the HiC unmapped reads.Default 8 --hic-blastdb blastdb BLAST Database to use to classify the hic unmapped reads. Default /scratch_isilon/groups/assembly/data/blastdbs --hic-readsblast hic_readsblast Number of unmapped hic reads to classify with blast. Default 100 Finalize: --no-final-evals If specified, do not run evaluations on final assemblies. Default True --busco-lin busco_lineage Path to the lineage directory to run Busco with. Default None --merqury-db merqury_db Meryl database. Default None --merqury-plot-opts merqury_plot_opts Meryl database. Default None --meryl-k meryl_k Merqury plot additional options, for example " -m 200 -n 6000|". Default None --meryl-threads meryl_threads Number of threads to run meryl and merqury. Default 4 --meryl-reads meryl_reads [meryl_reads ...] Type of reads to be used to build the meryldb. Default ont illumina Wildcards: --ont-list ONT_wildcards List with basename of the ONT fastqs that will be used. Default None --illumina-list illumina_wildcards List with basename of the illumina fastqs. Default None --r10X-list r10X_wildcards List with basename of the raw 10X fastqs. Default None --hic-list hic_wildcards List with basename of the raw hic fastqs. Default None ``` # Changes made to v2.2: 1. 
General: Now default read_type is nano-hq 2. Rule trim_galore: "--max_n 0" has been added to the default behaviour of "--trim-galore-opts" 3. Meryl: New option "--meryl-reads" has been added to the config. Default is "Illumina ont" to build the meryl database using both type of reads, it can be changed to one or the other 4. Merqury: Option "--merqury-plot-opts" has been added to config file. It can be used to modify the x and y axis maximum values (eg. --merqury-plot-opts " -m 200 -n 6000") 5. Genomescope: "-m 10000" is now part of the default behavior of "--genomescope-opts" 6. Hic_statistics: This is now running for each assembly and mq for which a pretext file is generated 7. Assembly inputs for different steps: a. "--assembly-in" to start after assembly step (eg. Evaluation, polishing, purging and scaffolding) b. "--postpolish-assemblies" to start after polishing step (eg. Evaluation, purging and scaffolding) c. "--curated-assemblies" to start after scaffolding step (eg. Evaluation and pretext generation) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.567.2" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "Bioinformatics, Genomics" ; ns1:license ; ns1:name "CLAWS (CNAG's long-read assembly workflow in Snakemake)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-12T19:29:55Z"^^ns1:Date ; ns1:dateModified "2023-09-13T13:41:08Z"^^ns1:Date ; ns1:description """# ERGA Protein-coding gene annotation workflow. Adapted from the work of Sagane Joye: https://github.com/sdind/genome_annotation_workflow ## Prerequisites The following programs are required to run the workflow and the listed version were tested. 
It should be noted that older versions of snakemake are not compatible with newer versions of singularity as is noted here: [https://github.com/nextflow-io/nextflow/issues/1659](https://github.com/nextflow-io/nextflow/issues/1659). `conda v 23.7.3` `singularity v 3.7.3` `snakemake v 7.32.3` You will also need to acquire a licence key for Genemark and place this in your home directory with name `~/.gm_key` The key file can be obtained from the following location, where the licence should be read and agreed to: http://topaz.gatech.edu/GeneMark/license_download.cgi ## Workflow The pipeline is based on braker3 and was tested on the following dataset from Drosophila melanogaster: [https://doi.org/10.5281/zenodo.8013373](https://doi.org/10.5281/zenodo.8013373) ### Input data - Reference genome in fasta format - RNAseq data in paired-end zipped fastq format - uniprot fasta sequences in zipped fasta format ### Pipeline steps - **Repeat Model and Mask** Run RepeatModeler using the genome as input, filter any repeats also annotated as protein sequences in the uniprot database and use this filtered library to mask the genome with RepeatMasker - **Map RNAseq data** Trim any remaining adapter sequences and map the trimmed reads to the input genome - **Run gene prediction software** Use the mapped RNAseq reads and the uniprot sequences to create hints for gene prediction using Braker3 on the masked genome - **Evaluate annotation** Run BUSCO to evaluate the completeness of the annotation produced ### Output data - FastQC reports for input RNAseq data before and after adapter trimming - RepeatMasker report containing quantity of masked sequence and distribution among TE families - Protein-coding gene annotation file in gff3 format - BUSCO summary of annotated sequences ## Setup Your data should be placed in the `data` folder, with the reference genome in the folder `data/ref` and the transcript data in the folder `data/rnaseq`. 
The config file requires the following to be given: ``` asm: 'absolute path to reference fasta' snakemake_dir_path: 'path to snakemake working directory' name: 'name for project, e.g. mHomSap1' RNA_dir: 'absolute path to rnaseq directory' busco_phylum: 'busco database to use for evaluation e.g. mammalia_odb10' ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.569.1" ; ns1:isPartOf ; ns1:keywords "Annotation, Genomics, Transcriptomics, Biodiversity" ; ns1:license ; ns1:name "ERGA Protein-coding gene annotation workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-24T14:19:20Z"^^ns1:Date ; ns1:dateModified "2026-03-24T14:21:14Z"^^ns1:Date ; ns1:description """# Protein Ligand Complex MD Setup tutorial using BioExcel Building Blocks (biobb) **Based on the official [GROMACS tutorial](http://www.mdtutorials.com/gmx/complex/index.html).** *** This tutorial aims to illustrate the process of **setting up a simulation system** containing a **protein in complex with a ligand**, step by step, using the **BioExcel Building Blocks library (biobb)**. The particular example used is the **T4 lysozyme** L99A/M102Q protein (PDB code 3HTB), in complex with the **2-propylphenol** small molecule (3-letter Code JZ4). *** ## Copyright & Licensing This software has been developed in the [MMB group](http://mmb.irbbarcelona.org) at the [BSC](http://www.bsc.es/) & [IRB](https://www.irbbarcelona.org/) for the [European BioExcel](http://bioexcel.eu/), funded by the European Commission (EU H2020 [823830](http://cordis.europa.eu/projects/823830), EU H2020 [675728](http://cordis.europa.eu/projects/675728)). 
* (c) 2015-2026 [Barcelona Supercomputing Center](https://www.bsc.es/) * (c) 2015-2026 [Institute for Research in Biomedicine](https://www.irbbarcelona.org/) Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), see the file LICENSE for details. ![](https://bioexcel.eu/wp-content/uploads/2019/04/Bioexcell_logo_1080px_transp.png "Bioexcel")""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.56.9" ; ns1:isBasedOn ; ns1:isPartOf , ; ns1:keywords "" ; ns1:license ; ns1:name "Jupyter Notebook Protein Ligand Complex MD Setup tutorial" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 9 . a ns1:Person ; ns1:name "Payam Emami" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-15T14:36:32Z"^^ns1:Date ; ns1:dateModified "2023-09-15T15:36:33Z"^^ns1:Date ; ns1:description """This workflow represents the Default ML Pipeline for AutoML feature from MLme. Machine Learning Made Easy (MLme) is a novel tool that simplifies machine learning (ML) for researchers. By integrating four essential functionalities, namely data exploration, AutoML, CustomML, and visualization, MLme fulfills the diverse requirements of researchers while eliminating the need for extensive coding efforts. MLme serves as a valuable resource that empowers researchers of all technical levels to leverage ML for insightful data analysis and enhance research outcomes. By simplifying and automating various stages of the ML workflow, it enables researchers to allocate more time to their core research tasks, thereby enhancing efficiency and productivity. 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.571.1" ; ns1:keywords "Bioinformatics, Machine Learning, automated workflows, GUI" ; ns1:license ; ns1:name "MLme: Machine Learning Made Easy" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/input" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker masked genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker output log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker repeat annotation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker repeat catalog" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker repeat statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatModeler consensus sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatModeler seeds alignments" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-22T02:01:45Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:35Z"^^ns1:Date ; ns1:description """# RepeatMasking Workflow This workflow uses RepeatModeler and RepeatMasker for genome analysis. - RepeatModeler is a software package for identifying and modeling de novo families of transposable elements (TEs). At the heart of RepeatModeler are three de novo repeat search programs (RECON, RepeatScout and LtrHarvest/Ltr_retriever) which use complementary computational methods to identify repeat element boundaries and family relationships from sequence data. - RepeatMasker is a program that analyzes DNA sequences for *interleaved repeats* and *low-complexity* DNA sequences. 
The result of the program is a detailed annotation of the repeats present in the query sequence, as well as a modified version of the query sequence in which all annotated repeats have been masked. ## Input dataset for RepeatModeler - RepeatModeler requires a single input file, a genome in fasta format. ## Outputs dataset for RepeatModeler - Two output files are generated: - summary file (.tbl) - fasta file containing alignments in order of appearance in the query sequence ## Input dataset for RepeatMasker - RepeatMasker requires the fasta file generated by RepeatModeler ## Outputs datasets for RepeatMasker - Five output files are generated: - a fasta file - .gff3 file - a table summarizing the repeated content of the sequence analyzed - a file with statistics related to the repeated content of the sequence analyzed - a summary of the mutation sites found and the order of grouping """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.575.1" ; ns1:input ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "repeatmasking/main" ; ns1:output , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-09-25T12:10:08Z"^^ns1:Date ; ns1:dateModified "2023-09-25T14:35:37Z"^^ns1:Date ; ns1:description "This repository contains the python code to reproduce the experiments in Dłotko, Gurnari \"Euler Characteristic Curves and Profiles: a stable shape invariant for big data problems\"" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.576.1" ; ns1:keywords "" ; ns1:license ; ns1:name "ECP experiments" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Bigwig to average" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bin_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/average_bigwigs" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-04T02:01:49Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:29Z"^^ns1:Date ; ns1:description """We assume the identifiers of the input list are like: sample_name_replicateID. The identifiers of the output list will be: sample_name""" ; ns1:input , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "average-bigwig-between-replicates/main" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-06-11T12:55:00Z"^^ns1:Date ; ns1:dateModified "2024-06-11T12:55:00Z"^^ns1:Date ; ns1:description "Pre-processing of mass spectrometry-based metabolomics data" ; ns1:isBasedOn ; ns1:keywords "Metabolomics, identification, quantification, mass-spectrometry, ms1, ms2" ; ns1:license ; ns1:name "nf-core/metaboigniter" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputerLanguage ; ns1:name "Bpipe" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-10-07T07:38:07Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:52Z"^^ns1:Date ; ns1:description """# RNA-Seq pipeline Here we provide the tools to perform paired end or single read RNA-Seq analysis including raw data quality control, differential expression (DE) analysis and functional annotation. As input files you may use either zipped fastq-files (.fastq.gz) or mapped read data (.bam files). In case of paired end reads, corresponding fastq files should be named using *.R1.fastq.gz* and *.R2.fastq.gz* suffixes. 
## Pipeline Workflow All analysis steps are illustrated in the pipeline [flowchart](https://www.draw.io/?lightbox=1&highlight=0000ff&edit=_blank&layers=1&nav=1&title=NGSpipe2go_RNAseq_pipeline.html#R7R1Zk5s489e4Knmwi%2Ft4nDOz2WSyO5OtbPYlJZCw%2BYLBATzXr%2F90AOYQNtgY8Ex2UjuDACG1Wn13ayJfLJ8%2BhGC1%2BBxA5E0kAT5N5MuJJImKJE3IPwE%2BsxZd11nDPHRh8tCm4d59QUmjkLSuXYiiwoNxEHixuyo22oHvIzsutIEwDB6LjzmBV%2FzqCsxRpeHeBl619ZsL40XSKmrm5sYNcueL5NOGlMzPAvbPeRis%2FeR7fuAjdmcJ0m6SOUYLAIPHXJN8NZEvwiCI2V%2FLpwvkEbCmEGPvXdfczYYcIj9u8sLiRdX%2F%2FXT3%2FOR9Xj7NbwX7r3%2FtaTa4%2BDmFBYIYNMllEMaLYB74wLvatJ7T%2BSLSrYCv%2FrdertLn52CFWzZvfQoC3HApksdQHD8nKw%2FWcYCbFvHSS%2B6iJzf%2Bl3Q4U5Or77k7l0%2FJt%2BjFc3rhx%2BFz7iVy%2BT1%2Fb%2FMavUrfi%2BIw%2BJmtM16H8yooE%2BhGwTq0E7g82Ld%2F339bf1xfvvz39Y%2Bv19%2Bjh%2F%2BmcoIgMQjnKN7yoJIAmkA394lkqT6gYInwGPEDIfJA7D4UERMk%2BD3PntssNP4jWWv%2But8IH53FrRF7D4s76%2Fbm%2FvrHy8003XgPwFsnn5pImodncA7dB%2FJFz5379Ib2a01Q9Dyk6J9d4r%2FmyW%2F6mhWWW%2FDAaF9pawnPcqvvAQt559lWugi8IKQPydf0P%2B6iOYEfJ%2FgkamTcIFpQpBQLKEquHNfz8p1K5CfrNL1D9658Pg8BdDEilJrtYOnaCQbNPRBFKTal21qgn43x4gUEcFNT2IZZDyiM0dNWTEjuGmpCaxLaOjU1dv2Yo1RGQoAXOSKlpIt8CPZwkVl80%2BhzpZGfY6BPB9giGUIBWzIWm8MWRTCr2KJ1gS0v3%2BLLr8K%2F%2F328vrsKlyL8%2BuH5j6l8MIt5SwylIT%2BRlaH4CXeNVfG1LzLy4RmRNPGlTfYv2c2k8dr10uFU8aBAVHpACnkopBDlx9sb9zE4%2B3O%2B%2BvLjW%2FDxXBGnEodJ9I8kXJAcdx0S%2FtgD3LcNk8OcCdMrLEDKjsmNaURBeIYfEJXVE4dXX0iTs%2FMlcH0CP3eFPBfzNtx4vp2Ns68Wm5mYsO9AuplO0ku0An5FHCk3ALLublz6TuiDCP2apbCYYQEkeHjOfypdqbQFqMCWFc1GpiVA3QKCaiqm4uiOo5qaIAlT2wRQRbbqiFDSZR0auqBrjgUdVRUECymKAnUgI6vwkUWInMJnFnFMVOczgkjS9dyNF2trhmUQfOEuLSfACIz%2FvP1wT4YuzQN8YXmBRdANRDHCq3idTirCf9%2FdnuF54j92T5iBrfa5FCcAB08qQOdiE2%2B5apCsLEKOBvWmlCK5MbDw27m9JAkQOa7vUh1CEt5Z5M771ptr1PP2AgAjSgujFbIpNZQEOlFiNQng2iMod%2FrzQ1GEGYwLCLkGPswmNyXTdh0sP1AmBDDNJ%2Fttjwl3gINPyF7HBN45HKwZSb3%2B1YXaYooFtcWoai2yztFaROlYOq6yW3jButuK%2FInp6gpreWTyTBZJRUAjbchshwoXXFu5eGMYihWQpcDJQyxtayyKJN%2F7K3AptiUfm0pqYcH0YgeB40RYRCovQjboA2wP5pCaRvr394LWsUvTKOgZG7WjB3VSa6o5DKU4bBt1rXWpSvGs4IkQPNefM5JnBSF
E4RQ34wsmIyX7ghJEAW8CNbuTN%2BKT2xNJRgr5yZ5YAQizvqVt%2FKMiw%2FDEFb4waWNyG2LpK5rFTwVLWVmGhKYGRV2Fmg5EYAiKaIgSFiNFxZIMgGQw1R1bMETTAVARJB0CSVJkU9ZsQVRtLGealiqogoNFxkFkyNp5MiCVb2%2BTGGtkwcoi8PlnKn5V0akeV8qsFS0LQLQxduNB2RRZch0KlacylMo9JdXjVWwF8LnSWBEIY1hucdMGvGD405dXVBwB3nPk4lXB%2B1OYMxGMeq6EFPoE6ckSCkwmwM9pYEmcO74VrdgssFQ%2FY1D488tD9O0rnsq7P79Mv319P8vB3eWsRXWU1RausFt9jDRWodAGLh%2Fwzl%2FhSfhYEosYNcM7zs%2BoZhkPxzqhhv2%2FkYGRRt6WIe1s29fItUTecm3gnSUejJhICOepP8NDDnkrwE85HjUGOtT%2Bl5cVHhdujO4xESA9PoZgtY2DtxCRi4Z9WZQ48p5YFfiMo3mB1N%2BSWHNJLF2u0Yhi%2F0Qo%2FGL9j4RTYCpPXGhvR9QqUvZaQUu1DV3XHMlxREUxVUeVbRGopmTqjgMdxZqKGJUtyTItLFxZQFZMXVRUJBkOhIajiwKybSQpUB1G0KqZJQMQn7n9FrJaCVkRlom8zGSyN6Nzy%2F1RYaS1zNGaQ2Pe1eHgQwSgjbc2oSekZzYHISd3vlv77q81xYa1NcXUlawNIReU2cDg%2FbYpH0tMmTP5rysozHPiJJGqHTLbApr0ObkQrTwszcSdLnPWJaGXR5pUO%2Fltu5wi5RhoIticoMCXBSPuEvjEPYQBfJmTB1oIgZLxWwhsLgSmRuSTEQJXXM69xMPHGjJlzsJqI%2Fex9indQeSelruHkT6eJrvqjLJnn%2Fk9dkl3f22cYzx3SUUM4VKdVbltUXlqr6ky8kBuGnxR9ZwKuW3l08x3NHsAYdTArWxZqmBAKIuyqEhYaIVAFIGlSw6CyNEgmiITmhDaClINBe9QoBu6ZGL6AQSkmo4sQQgVU5Hso0uqmYdvI6funC0DWd1jjWXXRONogybHQYoSR12FwYMLCYLnfYY8ZN%2FKYukIKHRv8YsFIBVmzvkue43MleubFN4Fq3jWXjrLgXMnl5a2c%2Bnx8eMs6WEXPzb11mR5f3bMi%2F3p1W2ZfnC0fkuluIrFDo7nt5R%2B%2By0369W%2F%2FMPHBimNhE6wQUv3T9oFk%2BiStzZIUe3IKMYviErJ6spmXOnoLAzBc%2B6xFXkg2jJgVeMOuHZcpefTcW2Qm42gW1TnxU6MUZ60A99x5%2BuQ5mzkg454tq1RiJCgkVSGBflZ6K1mPiFltZLZdCOdQfRAMvquGUBIO3lsxi5rBTL%2BQ7XiWI0dkRt%2BGfKnXydB9wGSVYjA0sKiay08OE%2FsYVftSQbtEav4xukg8OpBWbpbC8aq2NkCywYT%2BCsNNOqTUKTHIPxJAghp3z6YU%2Bl7m9zNazukIUSM5UUtvvuuuwFiQIC1RxiYm4wggWXkrcOCF4BELrjEuOoH5HEfISIo4cXb%2FRGCv0noY5LTlXbrBTbRvXIL2Ghe7ydpiCfvLkfNyihF9iJBeGKCwQNImBFWusji%2F0SI4OQGMFg%2BjV1%2FHu2MsihpX6NWplRZ52gBHDVA1EoSWmfu7DSgcUDVSWkLxVenOnGT1MXqQgyapD6MMtVVLlnjZDLtKDpYW41HF4wC1kmatlXjKT8vG%2BohGs8uA3mmKVyDKP77gkoBTWzTv9aYCMfPkyTijhJ%2F6isMwSOFKU0iIPPAktPPZHN0aX3Fj%2BH%2FO%2FjGL3vGmjN5a6exjiFRrbGORyZLPCfx33%2BiV5eyVsNpaJZ20rEw4eT1HcSE1DJuyZzUeo1DUTWhHssPMOnxU2sPJn49kbrjkSylIclSjmM2akuylFIOvpxYmepIVuV5xZwcQLK
aodWOPArkWcHjb5Saqp3XbekEpRRVOj6K7CgO9BtFEqpjjhNFxD5QZHvdhzeCIsd2U6hiaWlTZ1LH3oUKI9J6YESpQjL%2B0iGd1nlpTF06V8MOK%2BazPd7rJDZ8ppNnF6xTaZdS3sdy68OQFE3rh6QoRrckhWsw0oekKCNztXeHmnLncs6eQYmZWeX%2B69ldY4vLHUrcKqsVDTs%2Fom0lc3JFGKjtjStKDqzHNK4ct4SDlNZrGMiYwodt1ciPF2t6T1ZL2EQGlKgHiTXIQTkJNShapwgUy6HkSxdCSlxKC7JpuUvAIE9qygnSipI04J3t%2FITe4mGp5%2FgfBt0FIRfqJfGIqefi5hr%2FI4%2BH8UXg466BS1cWYWx9RFE8qVZA7AAHZIN8vIAFSgUJuH4dWWmNBA1XvL6YwG5fLy2cknh8hT2CT76s49Wa5sMEHiQoc1ahYfjmL3ucoSeZuzLIpuEynyXL6dmMu0n05%2BEpF3mETUt4XoOl6xFcu0HeAyK9TvpwX%2BpKsSirrKrN8HwPYtcQz%2FWR4blQQfUQRWsvHmmcVT5zLdo2nGN8HLqOg8Is9Bs9rTCsIlYErAq1Xob04QsZiB%2B69mKJ%2FM33x7vVj7CtVaEqw3C39dGS7FN9e8Tb2gvmI93TeGTTlGXZgedhYZHGCDH2hYC9mGyCcl49MityjzwqejGMp2vBVeX%2FxPnl8i5a3vijO22gnaq%2Bs%2F4vvko%2BScTpxICCQpdm0dxnCetdK%2FBpqPlOBd4czJS4ddyjIW1VyTxERIEaKXXbN7nw8opNa3a3hPleKzmF0BINZDuSbkuOYQFoObZkAmSqOhJMQZyKliWZJoIahrJsAMG0LUMVBc2EDrB1UVQUAxi6WvzIMXIKabQw%2Fp0t13W0cP3nH6wK7Q%2FWjDffDxZleV0HAQa90t22IdxtY9d5CzSj48fDmC2Qt0JhNLvbtlJANGwk6tAwdFs2bVGWbVWWoQmRIUBoy1PbkgQJiQCJoi7aUHdMVdaBaMqGZTqKSRZMEnTdGuFK7QBFumT8x8Yafv8XHiim9BmFIXQoJIDMlwKmksssh0FZkO%2FnACsMhFBTDBVYGQ%2FM1mMiz9jrKKZdbaq8bXI5cwHHeHmjBSZ%2BWc2zNIR6z6Di16fUa5wqtPygZFk6nEfyg78OL6%2BQl5gSo2L%2FIlPy4Y28dFS%2F%2FJHiwDixpsX0wTRwa5cTrcOwU9cnhoOnFRUz%2FbixNwQSarBkhIZYhylNSLO%2FWWAqDUf1XCsECSIf3V1Smkxrz4nSV1jqccNQZXFYzwmfEh2evzwcJSq5%2F9uSpQMokToMJWoaIdQhJao0RGuLhLFLWLaLn1eIr0W19N5SKy2rGsbK02LaQYSZ5xXqh0SVJ9WeRqmvg0aZI6RRqRNx%2FKFkYzhqTGlaHF4Z7KwxbnSPNuQin1p0T%2BNFlo8TeNY6XiylF1n61o7cidLz6mG5E425XcIHfjC34aFczEEgXof9srD2nEs7Sc5VjkuS1RFyrh3VgcYRVju6XNbUXrOTuKXVecYTulhpsMDygljHyEnlhwjKNt7btDar5c6%2FufM%2Bkkjx0O1k6O2Jiv4qiIpqGrOqd3VwsiKfAlnpVNwxBhN3DqUIH778uMrCbRpv902ETtHin6u6PUe09E4fokVhDu1pgfE6aMHA5rvzG1G8mL%2B4yvJGhnfXcPnpCU5PpvrgcKa646Rx7jTVZTFm%2FZnq4Hp1ByAI9%2FIW4Jdp%2FfckOpEKG33QFvzdkAy6PVkxT5KslC1uysD5FPzSPIcnb55caZ4CheqSAqlNA7lGWppH3WGrGao0DxGAzgP4jPUrqTHJC1NTTaqTCeSvRJwiH0T%2BnMRb9EH6yCfJYRh4MFJr8qfmz6I%2FXfKnjtFsIw9anOfkHA6Nk6CNnpSzZkzuZCowjcLh0DideJwOB0XbzsSGcjh
cXv24vLpHv5pzsMstSUUbS0EvojvCv9tzLuV1JEIr8rCca6tFfDTB99W8ojgE9s%2BRxt4z4zqZcjLIxOD24EaksGOUqMdvMWWOawk%2FWp4RP1bt1YcINMhGasHUj37MRqnas14%2BHaPxMRumVOior2M29OKxGeM4ZsMY9TEbd4gKHzbRlT8gP1jy3ZoNSWT%2FeVW0tgw5CB6iJ5o0Oi9Mopzlk0VwCLSWv0vpP%2F3GPHb6Ti%2BpJIxsStAnuSL5OvP0rPtdh6vtdeDXqEvO62K15LyYRjUWsjuEDvgUfwcPXnI%2BoyFvt%2BQ8P1N5e13L3u3aRVNPJkb0LVMMlOFscoIYucsmjivD2ayvPURPSucQfit4IiefUzgS%2Bm4FIdaQprg54wH156ZbWBeZUyxMVSoZKeQneyI7EJ3clrbxjpzhIUmHpWi5XRt7a%2Be%2FM5AU6yPhX%2B8wiuNVY7UmWF5nMak4zdX8iQWF8P3mGksO1FaT1XxhSViAohrvyACWt5WL%2B4hitIryJ8M0Odv7WKePt%2Bz%2FjQxs0upo81M4qdxMi4emibMm5xyF1OLUuWjFNwEMWgN0P89M5%2By6Sx4simqVCQ96dnkzHpyNexBL5x9%2Bkyoj4BGCGIzT0klPiJlsSv%2B9e3FXKwTfbxtZLzqtD5YshQJzwweiu1LHBjPDroDLuC9KRKGmNQ16HH8ElisPibM7cUZhPJu%2FjHuc0h7jPGnTdslgkF7urAZxNHtB6kgasdOGlFcuFBEZESmzwHJSKvD4plwzvIrM%2FRaAO%2FnC7HqfoRSpH3l3vEznhxQdmsxAqzHdJUWemgYsJAAhHJ3Zqikvx3v7JwweWSlUViyJjProWZJJPSkyg9aRC2Ye%2FqcbuaArVYLRZ%2BTCjfDRWdwasfewuLNub%2B6vf7zcTOsDFyqoRegsl4%2BUDUwKj118QnMiPm5Ql3VXYRdWLbtI6sbnFolb451wiVJTpdR8bWl5HlYUSWYnnEQuchK5ghdKGq%2FJcwMcwki4SCBXkIAeRj%2FZlHNt4EbZveW6AJ1Rio8wqluK50Epe7Y7A90hXmOtLEPm7K0pNDeNFK48BQNrFESBsyLyK1fxbrM4g8uKewqIbYxxR0CuzNiRxy6Fh13qkbCrT4%2FmVvR%2Buw5NLlhO0BOVGMOvmVNly3Z8a16nrcB4s26V0Q5sQH9PByym7NvhyS88DtPFgT5cSjbkOScuM%2BdfByVb2Cij2N6AhauMnKJYPW6KL%2F9oR8LO%2BuM6Mkv5HzEGp50PbN8Y0el6riOaaJggW1aX%2BdUql2UhVpOaLeLRtMuqmf2S5fxSsxNiQZOQJMETGYMXl0HDMN7QkvEsy%2BnBvgXfiHn4knEtyydT36K7HL1qgGHtyuZNzd9N7%2BNX948r6%2F7%2BKXgOnj9%2B%2F%2FLPtKmlOT2OaOjM83Isr1zIJN%2F5vKL1k3meHDpEbRyfyDFJjat4Batnuh7RT%2FIJesKSgNEvF2uWNKYSyBFt4ks3ImoN8FGwJtfJrKjFLbONS1zreD2ynZxxvJyQrgvDFvzipipzzC0Dxy23oo34IgsnLtPLtod17%2BmJ42Js05zm1H8yksR1aWTVWTrEhhL3lPtHhqHz1o3iQbspi%2BOytOrx72ViVpal9y5C1UxcS%2FW0V4iZJTo1MJniQ38oKrVt1I1cBuvQez4nKdco3s30N%2FjCrmKWlS1fTs2WkG8uJSi6XDoBO9uoBd2oKibIegc%2BGf5yH17A5IhFuqS21UFbVQhstJn62SQ1qkm5HKB5nJRmXSyqQJrZjwr0ee3F7t8XzYOA1sslCAkKSQJ%2BLYvX26Hf4CHMQm8184n7rFbHmW70HIgekFdQc1gx4yUZ7y97W8hPPbqcnFJTjvjhHXnWp1KzNauv1nlZDT0I6UjbGcULbsQSvcxBnSL7eebfTA11E0m
%2Bpv9xaVbePq6VaViRT%2BWtf7jTK4381JkK5yGALl7%2BUjNGM5IOQingnCSHpMR0ASCllQKPIXaAT6YszLRiLQRFUbKmPAtUjZnICRkSDXV2LEYo1aeC1CFSbQwLfWvtlVu8zJb%2Fd9ksTA9i9AKW1sfCGWmJQY9Roi2Y6VWi7DctyZESJO6apUNEZFWjuOhh2NbDPxf3hM6mRRoEjBiPzANRrMCz79B%2BrYEfk2PldhfF2dbbp%2FQkOhom5BEhgfG%2FCEXRshg0tHVU9MQomjJF14BUxcJvFwa3o4sPrAQkc%2BrmSkQ2evme5lOkggOCPqIDwbt%2BTQ7iYyeUW6Tjz5f3OZRhqZ4LBGK81vsvyraqYO8ur95POGXEk3MB6cgoMoMo3n8EV7tLlmf5r3t%2B4gxCWoEDeMVPuL7trSHbJ5TRp%2BkKdC%2B2%2FR5urBKAHYzkcBdQIi4M6wAySwGhilgNC5BEns9uj7hGfImlsDgvmuJpLT4HkEidV%2F8H). Specify the desired analysis details for your data in the *essential.vars.groovy* file (see below) and run the pipeline *rnaseq.pipeline.groovy* as described [here](https://gitlab.rlp.net/imbforge/NGSpipe2go/-/blob/master/README.md). A markdown file *DEreport.Rmd* will be generated in the output reports folder after running the pipeline. Subsequently, the *DEreport.Rmd* file can be converted to a final html report using the *knitr* R-package. ### The pipelines includes - quality control of rawdata with FastQC and MultiQC - Read mapping to the reference genome using STAR - generation of bigWig tracks for visualisation of alignment with deeptools - Characterization of insert size for paired-end libraries - Read quantification with featureCounts (Subread) - Library complexity assessment with dupRadar - RNA class representation - Check for strand specificity - Visualization of gene body coverage - Illustration of sample relatedness with MDS plots and heatmaps - Differential Expression Analysis for depicted group comparisons with DESeq2 - Enrichment analysis for DE results with clusterProfiler and ReactomePA - Additional DE analysis including multimapped reads ### Pipeline parameter settings - targets.txt: tab-separated txt-file giving information about the analysed samples. 
The following columns are required - sample: sample identifier for use in plots and tables - file: read counts file name (a unique sub-string of the file name is sufficient, this sub-string is grepped against the count file names produced by the pipeline) - group: variable for sample grouping (e.g. by condition) - replicate: replicate number of samples belonging to the same group - contrasts.txt: indicate intended group comparisons for differential expression analysis, e.g. *KOvsWT=(KO-WT)* if targets.txt contains the groups *KO* and *WT*. Give 1 contrast per line. - essential.vars.groovy: essential parameters describing the experiment including: - ESSENTIAL_PROJECT: your project folder name - ESSENTIAL_STAR_REF: path to STAR indexed reference genome - ESSENTIAL_GENESGTF: genome annotation file in gtf-format - ESSENTIAL_PAIRED: either paired end ("yes") or single read ("no") design - ESSENTIAL_STRANDED: strandness of library (no|yes|reverse) - ESSENTIAL_ORG: UCSC organism name - ESSENTIAL_READLENGTH: read length of library - ESSENTIAL_THREADS: number of threads for parallel tasks - additional (more specialized) parameters can be given in the var.groovy-files of the individual pipeline modules ## Programs required - Bedtools - DEseq2 - deeptools - dupRadar (provided by another project from imbforge) - FastQC - MultiQC - Picard - R packages DESeq2, clusterProfiler, ReactomePA - RSeQC - Samtools - STAR - Subread - UCSC utilities """ ; ns1:keywords "rna-seq, bpipe, groovy" ; ns1:license ; ns1:name "RNA-Seq" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Bpipe" ; ns1:url . a ns1:ComputerLanguage ; ns1:name "Scipion" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-10-04T10:58:43Z"^^ns1:Date ; ns1:dateModified "2024-07-10T12:30:33Z"^^ns1:Date ; ns1:description "The simplest workflow among a collection of workflows intended to solve tasks up to CTF estimation." ; ns1:image ; ns1:isPartOf ; ns1:keywords "scipion, cryoem, spa, image processing, TalosArctica, TitanKrios, Glacios" ; ns1:license ; ns1:name "SCIPION: acquire -> motionCorr -> ctf -> report" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Scipion" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2024-07-10T14:17:34Z"^^ns1:Date ; ns1:dateModified "2024-07-12T10:53:48Z"^^ns1:Date ; ns1:description """The second-level complexity workflow is one among a collection of workflows designed to address tasks up to CTF estimation. In addition to the functionalities provided by the layer 0 workflow, this workflow aims to enhance the quality of acquisition images using quality protocols. **Quality control protocols** * **Movie max shift**: automatic reject those movies whose frames move more than a given threshold.  * **Tilt analysis**: quality score based in the Power Spectrum Density (astigmatism and tilt)  * **CTF consensus**: acts as a filter discarding micrographs based on their CTF (limit resolution, defocus, astigmatism, etc.). **Advantages:**  * More control of the acquisition quality * Reduce unnecessary processing time and storage""" ; ns1:image ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "image processing, cryoem, spa, scipion" ; ns1:license ; ns1:name "CEITEC layer 1 workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-10-07T07:41:21Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:52Z"^^ns1:Date ; ns1:description """# ChIP-Seq pipeline Here we provide the tools to perform paired end or single read ChIP-Seq analysis including raw data quality control, read mapping, peak calling, differential binding analysis and functional annotation. As input files you may use either zipped fastq-files (.fastq.gz) or mapped read data (.bam files). In case of paired end reads, corresponding fastq files should be named using *.R1.fastq.gz* and *.R2.fastq.gz* suffixes. ## Pipeline Workflow All analysis steps are illustrated in the pipeline [flowchart](https://www.draw.io/?lightbox=1&highlight=0000ff&edit=_blank&layers=1&nav=1&title=NGSpipe2go_ChIPseq_pipeline.html#R7R1Zc6M489e4KvNgF4fPx0kyzmSPTCbJfLOzLykMsq0NBgLYOX79p5NTYLC5kkyyNRuEAKnV6m712VPPNs8Xruas%2F7YNYPYUyXjuqec9RZGHitLD%2F0nGC22ZDGe0YeVCg3UKG27hK2CNEmvdQgN4sY6%2BbZs%2BdOKNum1ZQPdjbZrr2k%2FxbkvbjH%2FV0VYg1XCra2a69Sc0%2FDVrlcez8MZXAFdr9umpMqE3Fpr%2BsHLtrcW%2BZ9kWoHc2Gn8Nm6O31gz7KdKkfumpZ65t%2B%2FSvzfMZMDFYOcToc%2FOMu8GQXWD5RR44%2F%2B%2FFO%2F%2Fxv8eH1%2FXZ5Hopg0tf6geD8184LICBQMMubddf2yvb0swvYespmS%2FAr5XQ1X%2FbjcP7rzQHtYRP%2FWXbqOFcxt2A77%2Bwlde2vo2a1v7GZHfBM%2FT%2FwS8cjNjVr8id82f2LXLxwi8s332JPIQvf0XvhY%2BRK%2F6c52uuz0Yyxtf21tXBNXDhBvjAvXU0HVqr4FEj0hU9uQJ%2BRlfPd%2B2HAH%2FQ%2Bp6ml4itGv0kbdrpV99vf27%2F2J6%2F%2Fnt3eTf%2F5e3%2B7asM8egHczoO2QLiVYt8gqHABbDRQN0X1MEFpubDXRzhNbZvVkG%2FEIHQHwyHxPj0Vfpjub6a%2BuZufbO4%2Bno7v3%2F92ucbeqeZW%2FapnjI20QxODbjDXzThyiI3xo9bjPqnLtlWwSX6a8X%2BTx5buMkWNDDyLt6awN8IVpnaApinwRY9s03bJZ3UOfkRLtrStjh2yHjNDc1bE2SXY6iPr5bQNKMvlfBv8FJ%2Bh9AE9XTlagZEiJBo1u0N1BkGrUzN8zg2cXIhkc%2F6aPFsDLj%2BTMrDrB1wffCciwns7nSsDpQRfYqRbbR806DtKUIIJ2xZ1xEaOJSGg9HxaCTEavlD49GXMf6tA4%2FgBjGyz55DeahcDSIpUzmGRfJISmHQUJqlMWjM0eoY%2FHn96Z%2FfSf%2F8%2B8f85ou7kY27i5fLvno0V%2Bs8D6uQ1xRkNeqwalZDHv3sutpLpINjQ8v3Im%2B%2Bxg0hug2nihDd5hn91Uluf%2FQHH
UGIbcFUCiHg%2FGasX%2Fxzvr4Bfz3qy18vd9%2FOXvvy6L1jYIyctYKOajfRkZ2FstAx1X%2BYj76jmZTX%2F2j0zVuDLO6b4KzJBsz5YrjPeTK%2B0fcI9n5GHeSh8yxg2GdK73OUSdPXpXh31Z9NzuZsfXndvwWP6Ilr6AATIjaLhnaaOwahlJExAQEoqWxz5AyOg0OfbCnoawv0NNrBfOrocA%2BW0IJEDEXog%2F%2FZM8dqFsRzNKtIm4alCxcsY19a%2Bz5WIXzG%2B0WZr6CPRLmBazoDC9EWZQ43i6WN6Az68%2BriFs9VWdnooo%2FX1rQXeJXADis75hwQiBTMMWp4GDPm%2Bho66K8BvztAEqK9e0nPJrsjh6AmQBHRPLObG8azTqOwaWuGhxU0GOBYHWMbWxMvnqRZBkZxz0OcCWom5VIaOc9XtIsr2IDPQN%2F6ZLiRDYhQFb8WWiuyNU888An9jzQ6GnQBmReZ3YlDbhnAw6ek4rPKPgJVck5IcDI1fUyQBQdNmZ8dKj9mDgVsLgECdHxy8J%2FoeOWggxaePBXCuLAz5Q2BfnAoBFcuny0MQzkFMkUAMd5WTvJJiR79hHpgEn%2BBvVx6mIomFqEC8aNd6Zn%2F%2FSsmSe%2BTnmOycyhKN3B%2Bk5WiEnPVAnPG5vvhAffb4j%2BsZECEGOtOBDKkgDwu7GdMHYlGFdPHhe0awO2jZixrYBBxxCfUU0JYPgruRDXxn4lsooIh%2Fg16OJphBO9W8thPK6IEXT5v4D8LtFjxm3kig4AfcVkuDfFscCZZFdhEByXpaLG9QPsd3pJSvQKoR3op2aD3F7bxkmpMcS7f4C2X17n8LeyYfpS3wOi7ECJt8PwhAZmFBIPI22ChL%2BDG9Ihz5xD9TkXz8LSNk5qD8N11zerq%2BsddhZOynK2ftz6Nz63iRWPza33dMBV1Dp0Fe5gNXrKXkRlVMXDcKCIRhIASMheVZfOlDCXC%2FphYEpE4sFAGdc38zCwNPhYjTrndwQRL%2FBkb9VqaRL%2BOlfi9mEBRhcycUK3LM4EAKJKZuVBQgm2jywjnLiGuKdPf4lpxcY0fMt6MuOYIRYcNGj60qHQgOaGERtv7ZLfge%2BPIPYT0fp%2FtoM9EPrDoqXufLu46PASnDuuF1XBOsm2d6nXQVCkpwDenYqHyNKIq64zmKlCADHaa62XqrbK6pUVQJnXnLECG2iq1LvWsQoJROa69gwbYpwoScavo5E4D3dIVejAGpBhABN%2Blj%2FWxNRYuoR7%2FunRiO%2F7gU9FhiIC5lwEq%2BQywDKt7WkMfYHcY%2FOyTS9yAqmd%2Fw6FI%2FyFgf7PyjgmHcz%2BRsaRRLRL%2FYGfVSMP4KtagRhI6t8lyaiHenVzSlCtbzNx7uPCjFrbujjsi%2FQSyxVzz%2FO9nhKoXETwet4h%2B%2BniEuo3WzDb5acjVnsiwiX0CDxbx5oce07cneP56uxggmpHD7xmv36DBYbYxD%2BwcuBvW76Mbj%2FqANgfsey9roOuUyRpE9CzBLph26C9yda6OM5gE8ZJiL5YqPz6NuOmAn5%2FG0xTlk8dymvSNy3smFWEgQmclfqbLIFPAXNhPrVOo%2BijCsCBF4F5lrft7xK1YQ3b6rcofQ4gie9y0f6MIs%2BfNOoki6rQJFMkXdj4IilS88mlvLDlxIpETRlEmVtGnwjU%2FGoVG1aLQXOS02KoaT4qh0eQgcVndKy5X6KZYmChVjppFfR6Eiyz%2F9o0%2BcpWFYG3quFJskdXjF7nSsK44j5h0X0lfeHerHRE5xnF%2BobCDT6YXcrI%2FO2%2FXyl%2F4mb6bWHkYUu5jOfW5yRelTB1B0KTz32yYj6CJ%2Fspo1gCC5p%2FEfyNoPQLS5F1g6LhiRYAQQ4f5ioC3gaFyHEOVfYy9AyhauXjXEooqx6BoYVX5wn7yI
VAK68p5S8R38QZoRg8bdx0HmyX2WJaz4hBo04lHXJGwVdWhgSzYLf5T1vN7dPIH2%2BFD1XxohWeQKq%2BfH0awpE79fMXuTDMljsBjtgNj%2BniBKbImfXyu6UkQ%2FbbfM4GEdDD%2FBOkA15dvW5%2B6AeIcJqhDuIUi%2B0d61PM2Vnu%2BLhSN8WYLpgFp1A2NXQnHXcR54XhnvGgMPo%2FJn2sbaGL8%2BwrMHcBv7TXhyjAZTmOor0rjNOrz7XEk6hfE80nH8FxKoboLvK2JOWIXkf0EIRmaGzA%2BEcaiPYTo3qxv0zlcLoEb%2BjDRoZjQ8xsfyjX9tmZZPGNJx3Z3DTt5OBZkTBHtZB75Xf1OnnZ%2BJ5v2qqPbGI2sz7mUbpsmkhZIJCXlWEDT1z3uq%2FcBkHmsFkTmKtjSn7uLe%2Bfs7od%2FtQHj6832yr%2B%2Fajd3SxNOW4efV%2Fnpf%2B95ddaaOSJ33J2hT2mJ2gUOQpuOkqi4C3nN7uI48SI%2BpAYQmXtraL3cswwG97QdHR3vcU92mqWNg5uNkR596n5m%2BGLSsbrLMx6QPujWYA1MB7je4EY8dXHHvBDOdhzi8RohmhTsBbxjXOx7GEtBEJVvCweltTMhxLPh8oWwMh%2FzcH3r%2BXQ%2B3M0%2BTAfhI%2FTy1ohA4CaI3TQtADBHKTyxd39gncgFJQNZqiAt4Fyo%2FT7eobt97XdJaaE25ffw7bpiX1qoJ9snBVXMeIo4axF%2BEEMHP8sJ3FjbYESwFh7dOvEQ49TXaXIV0UeZmFOl8jjTsZtOBc%2BjtPJ42JRzd2TLJFJcAvLTS6a4RHfUsTpTjV4kcWVNvuAqv47Ss5EoTWU9umdxyMrwvZ9%2BEvSs%2B6mYh6M3SyedtWYhkYcoBIsSyu9nWOJCI4C6R6kfpY70X0wcoWVQk5y%2BRoKUjoAPPT%2FonaCVXhat3EMoj4l8Cc1rEQCUp5Kj31QS%2B64LLHQNUkmhHMj1nd2UAw90b5y0JgiOCxK46h11jiVw4NkHLOFTEdr2hXan0Xf41cBakaMhWlg0wN2KiITaagOsiJDYDaJGpT464fK0bNw8LYtRpvKErQJalvQ2GEuTNC1r0NtAHLfzFqL%2FqnPJrijeh5OivTSL56Zv2XNroiY9sY7K4lzcE0vbnGEtE6l%2FU5BK6og6%2BiQBle1uEBnwiDlsAVdPEBNIH8l8D02ELqOx62zs5QnepHmCV4P0Neb5UFuiWD8t9eHp9PHu35%2FX9zfWn456ubv%2FXVyhFKGapgmVEKqV06mjNK0812s3JewDBewmA4hE657rhdW2a3EiIGi4z7U40X%2FEMKZuhoaP8vcR96LCTA1Res2FHkleT20%2BO%2BjhHB2vzE%2BpGYVEfPjl%2Bdq0W4L8coR%2FRYL8mPz0qAkrKuCTn3oE%2FGHL7sRila7SJr%2F8rdLNyRPxFlW6C70wzbs%2Bu0FvO7V93wQW0LFj5pkNlkuoQ8DSEzRA8Rblsw%2FR9fnoqtfhSGBwb91A9e5T8785ajYq6hXYPWoGHQQIqoQtStQc08ZvpSnO%2BcPNkLJwtKUp2kjqNU7RaiBJI255aYkkiYs9vvtae5XKPoVjXqcNUYuCGolO2%2FzegkaicEqTjmok5HIpTZrSSGw03Sse6syCs3TNNIkTRSOMiwyxvBT%2BPmKM1anAz6vtGGNulupMRETays3tPF0MiFjA1c%2BYMYpFazGVnhcLPexuqHEF2J700h6NBTbuuuK3FpOtcWFf%2FeOATf%2F7zcyf6Rfj%2Frs%2FIFbJlKeFmXJbwljusA8iX7VXC7kBJBxaxzbsC2DZsRJeOby5fbpGw55Ov%2F28u%2FxC%2BDPaFc%2FE%2FL6KzSMZKMXvSjjdN46WQnx%2FSyHQdBAOmQC0cTmLaBQ42h36A%2FZCoKWh8mOI2q4Iwjz
%2FTVsnoycxQXgO%2B4qhHFSio2MVN%2BLC00SYpUJw4Jfl2mjNSEBrEoCrt75GQO0%2Bbn0NYaR2q7EKcb1LwNxL6V2CRGQNcPqZwMtWqOjgWv%2BOsPpZdjakztdxDXh9JASZoFL%2BoeajlVClIInnbEL%2FO0FIbgBaVpxFAidDuXlo7YMFffdTeL2EFklFE%2BTvoaVPNIJuosoo1CNDQ0%2B9eNAjgAaON4gsSpEqlAWLcPbKVsQs%2Bf4PMrBeqVKclVbWrEn4mfFSMLkhz8IS9XWFPKdlndZO028gbbcsFw7W6xSTDcbdij6QVx7OT5CiPRkaPlt2UR9Iqlz1wmyDJ6%2FQcUhitpbPlOisy2IlbWuHz47kLEyVlY4GKXNNhTB1J4cGLd0sD27kAYHxYPXa7XEqB4zzTSuAlTjLkkVHT1GFzFlNpIznUO%2BwZQMnFY6ldukQJQtTTxRWkb1p9E3aLxBYBCKXwH4xqk3iOt6A0cHg4ialsJlcUAjjeeG645BGcnXdsDxgRW36DCCYyVP1MWHvaL8%2FGPaT1QsTauFRV2Hsz8sozrON4RmUNvnPovB%2Fuyb%2FybBdk%2F9X6Y%2Fl%2Bmrqm7v1zeLq6%2B38%2FvUrD%2FAS8MUUamHaK%2BQtSZXSUMRC%2FgIrEuUeoC59XYqFLDJZCGa38UWKe0FbaPF7cZdp1sSxgTFo9TTJjTbQMAjFFGFFnIpWwV24byyXjtQUXgxF0pFSHi%2BKIUG6VvgC7%2B5eJMNsT1GfP%2FdoemmTnA9IPjrbwYcHzfyUWq3DdmUV0E0WKRTsuqEAusO6oHuMVXqcFD0jSlgOzbCRwFV0LImlEoskTgwXp3UR80C5sox%2BrgrkmsRdj7nabS9yVSAXCpGrSTNkLnZ%2FXCukECxv0D7F1ONzamjJ2Y0fzRKVC4wPa2jp7MBatABVwGFmk8PEF3WYTRuP4jBt1mOB1AYwtxMatE46yX0AvVgSOQuLP%2BOakDO7xEigXb%2F0ETT1qMt4qHgny7n1MEZJDNeCFNvv9vQ5ies25aIUprbjZ1o3f06jXYleClBHRwNBjOR%2FEjlqEL%2BM97xkcSlWFmiSFOGSVWBPEbrTv%2FuEwAdqnoXAKqp4bsr6v0%2Fv3AFn%2BeuojsJawtXW5U7PXbSPxQuDoFOLlfxqzcVCKJBwO%2B42oJehK3jSnCfsxGEoKv4hmlKz3vtNA9JxgbZBpwUvE4qCHnkgbMcFIQ48kddBqqFmCHP1Y9xARKrdZII6cTcPzN3x%2Fkg1GGBJSdqT7T6YNislbGkrIn%2FumcP%2BdSvR4ALKtLwS3z2pboAIENrWJKUv2AgYLD1z68a0QDjtOy18Y0dlc8nb%2FxFSWgM8Ax3JEW5snXEEjRlbwELzwp7E1CNYdDeM0wnKzQYEIniQVmqKhfCcELPsAwAYJ0PAeEgSgtbK%2BzTYM7zE0TJXasu3o3bA1TXhNjQapuN8hH5D8kgpLQUVMZkKoagIzpx1qdzz5MkPrHKfC%2Bsw5eesfpO5OJo8phTODVp5CpaDMnEkk1dPWMadrEwcqWTX41ECC%2BvJxMGqtxKz61%2B43mzx5KDOC1kPjxROJqVqaU2AQB3KGrlWVOnVlrZjAz1satEsYG89IieTWZGTRODRowh9erKR7c259IzkOIOaSIJQjAZdesR0MM2PWqWD0qAUJUQXQXq6JHWU26OOohgOIfh5cvSOZKjiBfXePzYobaNDrttpV9ChYwnLKkSHhOiktowNeTJW88iQO%2BxCR5mta76c4sRGeIL7eG2IMfSK50U%2F789Kwr44cx5ORoNR%2FOgxkwaiWMk0f1YntWUIfIenkrr89vNKULzVBIGjSaymTmcSBBpwucRFEwsfR%2FADOIERzjTTixZcDOPj95w%2FqigzG2qM%2BQTywgmyEarZo0d19QmqTu%2BdDFIYqTMRyRSea
cqTTMGZZh%2FCdtZdsmZzSYD7xPYRwXfixoZpoXcfbAH%2FWeD3ndErz3jy0Tw7b%2BEGmppLyNYDCLiMR0GFjQ8LomdhCUTwaGi8A8aQMGcq8x9aYt3MSWCjoAFXGPoZ5FKyLfMFK%2FilNLWNLEqF%2FpiBAeEaDZ7iJ8uG0gsSrLCkKonp5YxPZJWoa8xndDhk4DhjW4%2FlcGlktL1SjqX5tWJzeVBnU5EocWahKoqAWYgtNNJkUD4G4je%2FqIRfEEc5dH8fx8js95tn5NClvXT%2BjJB65stEAEwoFmU7uB13QALsgE79z2877%2BcdGv%2FJn9%2F6P%2B%2F2WoDroLUXCNmdXpAVE%2B9Zlm0yxiE7RlXza6O%2FFaqqDAtmy5g0aFRot15Yh8tZCKFVNKtiU6XLC6qq5KPXuHOqqpo0VUVXPTcZTcc0Vcq4XCkLhVVuqltThejwwn7yIVC2FnzcgsIaKxdsEDlHb9%2BgAw7s4wQ7lDW7QDNqUFflJcBITiJPa5WNXM1qrfJKbBaoRFeDdkoYDde2vV0%2Bnmx%2BINY4KUgkeWGgrvDG4%2BWfD80biy57R3mjOixXePpI3lgMJysoMde9nGD7zPaJQpgtEKbKw7QOw9BkmpvRHukt2b8hO%2BPN5nzrFHd6DGQ2Y%2BuYJOTTqE9gE9gXUVf0r7tBn89NVJaNQs3KaOhlUOf0unPymiq3W%2FpczMqlNuW1dhh3jGoeTiNnRbl452r6nmqb72eFqZAoxpzYmcJEqHup0THe1pQKoY895pYpz16jt%2BdZnajpK1KCytzjoC3KMTqectQocCndIiYVy0dpgSaRRXE8m8Rfwaqi06dyJKMJjzLjTEtNZCthNdOTLyodsCJNhQPOHFeiv9qQvu1vrC0rQSy97WajuRhXFen7WS%2FIE12%2Feo2SSaLdO4BQUkf9N0cpkzLWeKp0T8YaZVeOoAb6dPJKl4y0XF6lmO05QZkjUCfYfhrY8CPC75z8CKlcNMXSOEn14v7NCVn7yxj%2FpmRtlltm5WoGBKF3H2uOCvIrU%2FO8pFAvVYM%2Bs6RmQ1SIWFRNRpUqSOIkRpZ9yRJLZDolT23NZIsZhr5rT%2Fh0SYqGCHMJOabtR9OMmCnbduRlgAXxc8uChx3dPT%2BebyrvDZhAhpWCf5zdYgLKy4NKCM%2BeaHaqoKQwCQBkFYUJyWVj0LHlmpSyzNkveSOBFvoSCT2n701PpfQrNV0HnkchY8KFq7kU1LgqxjOF%2BwL%2Fc312g%2F%2B0fd8EFtCJJ5cNlkuow3hu2ryPYZHDcqG%2B3pBn%2BDoyyEaHcnlNpoc20Qq79RV6u%2B7ankfG5VK5hZYPZd9ILYqz1iw%2F8KMrigzEfc0IH5K4Iw3NHvA%2FYOGPGlBDNGQTzo2WGcFHFIQCpuYUXZ1YIdQYtJZA87cuUcMgFHDhYsu63c17UR8X09ZhMDzug5dAxD2jwL13EDylxlDA37wUWqLGNF3Yw0OOzyDGJIUj8ofFtDpRLlEFM%2BCnrCDZpCDhgixgBsqs%2FFkd69Vs24%2FKq2i%2B679tA4uiX%2F4P). Specify the desired analysis details for your data in the *essential.vars.groovy* file (see below) and run the pipeline *chipseq.pipeline.groovy* as described [here](https://gitlab.rlp.net/imbforge/NGSpipe2go/-/blob/master/README.md). A markdown file *ChIPreport.Rmd* will be generated in the output reports folder after running the pipeline. 
Subsequently, the *ChIPreport.Rmd* file can be converted to a final html report using the *knitr* R-package. ### The pipeline includes - raw data quality control with FastQC, BamQC and MultiQC - mapping reads or read pairs to the reference genome using bowtie2 (default) or bowtie1 - filter out multimapping reads from bowtie2 output with samtools (optional) - identify and remove duplicate reads with Picard MarkDuplicates (optional) - generation of bigWig tracks for visualisation of alignment with deeptools bamCoverage. For single end design, reads are extended to the average fragment size - characterization of insert size using Picard CollectInsertSizeMetrics (for paired end libraries only) - characterize library complexity by PCR Bottleneck Coefficient using the GenomicAlignments R-package (for single read libraries only) - characterize phantom peaks by cross correlation analysis using the spp R-package (for single read libraries only) - peak calling of IP samples vs. corresponding input controls using MACS2 - peak annotation using the ChIPseeker R-package (optional) - differential binding analysis using the diffbind R-package (optional). For this, input peak files must be given in *NGSpipe2go/tools/diffbind/targets_diffbind.txt* and contrasts of interest in *NGSpipe2go/tools/diffbind/contrasts_diffbind.txt* (see below) ### Pipeline-specific parameter settings - targets.txt: tab-separated txt-file giving information about the analysed samples. The following columns are required: - IP: bam file name of IP sample - IPname: IP sample name to be used in plots and tables - INPUT: bam file name of corresponding input control sample - INPUTname: input sample name to be used in plots and tables - group: variable for sample grouping (e.g. 
by condition) - essential.vars.groovy: essential parameters describing the experiment including: - ESSENTIAL_PROJECT: your project folder name - ESSENTIAL_BOWTIE_REF: full path to bowtie2 indexed reference genome (bowtie1 indexed reference genome if bowtie1 is selected as mapper) - ESSENTIAL_BOWTIE_GENOME: full path to the reference genome FASTA file - ESSENTIAL_BSGENOME: Bioconductor genome sequence annotation package - ESSENTIAL_TXDB: Bioconductor transcript-related annotation package - ESSENTIAL_ANNODB: Bioconductor genome annotation package - ESSENTIAL_BLACKLIST: files with problematic 'blacklist regions' to be excluded from analysis (optional) - ESSENTIAL_PAIRED: either paired end ("yes") or single read ("no") design - ESSENTIAL_READLEN: read length of library - ESSENTIAL_FRAGLEN: mean length of library inserts and also minimum peak size called by MACS2 - ESSENTIAL_THREADS: number of threads for parallel tasks - ESSENTIAL_USE_BOWTIE1: if true use bowtie1 for read mapping, otherwise bowtie2 by default - additional (more specialized) parameters can be given in the var.groovy-files of the individual pipeline modules If differential binding analysis is selected it is required additionally: - contrasts_diffbind.txt: indicate intended group comparisons for differential binding analysis, e.g. *KOvsWT=(KO-WT)* if targets.txt contains the groups *KO* and *WT*. Give 1 contrast per line. 
- targets_diffbind.txt: - SampleID: IP sample name (as IPname in targets.txt) - Condition: variable for sample grouping (as group in targets.txt) - Replicate: number of replicate - bamReads: bam file name of IP sample (as IP in targets.txt but with path relative to project directory) - ControlID: input sample name (as INPUTname in targets.txt) - bamControl: bam file name of corresponding input control sample (as INPUT in targets.txt but with path relative to project directory) - Peaks: peak file name obtained from peak caller (path relative to project directory) - PeakCaller: name of peak caller (e.g. macs) ## Programs required - Bedtools - Bowtie2 - deepTools - encodeChIPqc (provided by another project from imbforge) - FastQC - MACS2 - MultiQC - Picard - R with packages ChIPSeeker, diffbind, GenomicAlignments, spp and genome annotation packages - Samtools - UCSC utilities """ ; ns1:keywords "ChIP-seq, bpipe, groovy" ; ns1:license ; ns1:name "ChIP-seq" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T12:32:21Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:39:53Z"^^ns1:Date ; ns1:description "This workflow uses Illumina and Oxford Nanopore reads that were pre-processed to remove human-derived sequences. Two assembly tools are used: spades and unicycler. In addition to assemblies (actual sequences) the two tools produce assembly graphs that can be used for visualization of assembly with bandage. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input , , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - Assembly of the genome sequence" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . 
a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:ComputerLanguage ; ns1:name "Bpipe" ; ns1:url . a ns1:ComputerLanguage ; ns1:name "Scipion" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-10-04T13:11:55Z"^^ns1:Date ; ns1:dateModified "2024-07-10T14:00:07Z"^^ns1:Date ; ns1:description """The ultimate-level complexity workflow is one among a collection of workflows designed to address tasks up to CTF estimation. In addition to the functionalities provided by layer 0 and 1 workflows, this workflow aims to enhance the quality of both **acquisition images** and **processing**. **Quality control protocols** … **Combination of methods** * **CTF consensus** * New methods to compare ctf estimations * CTF xmipp criteria (richer parameters i.e. ice detection) **Advantages**:  * Control of the acquisition quality * Robust estimations to continue with the processing""" ; ns1:image ; ns1:isPartOf ; ns1:keywords "image processing, cryoem, spa, scipion" ; ns1:license ; ns1:name "CEITEC layer 2 workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Raw illumina collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-09-18T13:01:50Z"^^ns1:Date ; ns1:dateModified "2026-01-20T17:16:34Z"^^ns1:Date ; ns1:description "The workflow takes a paired-reads collection (like illumina WGS or HiC), runs FastQC and SeqKit, trims with Fastp, and creates a MultiQC report. The main outputs are a paired collection of trimmed reads, a report with raw and trimmed reads stats, and a table with raw reads stats." ; ns1:image ; ns1:input ; ns1:isBasedOn ; ns1:isPartOf , , , , ; ns1:keywords "ERGA, DataQC, illumina" ; ns1:license ; ns1:name "ERGA DataQC Illumina v2601 (WF0)" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/HiFi raw reads collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/HiFi trimmed collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SeqKit HiFi raw table" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-09-18T13:14:42Z"^^ns1:Date ; ns1:dateModified "2026-01-20T17:18:37Z"^^ns1:Date ; ns1:description "The workflow takes a HiFi reads collection, runs FastQC and SeqKit, filters with Cutadapt, and creates a MultiQC report. 
The main outputs are a collection of filtered reads, a report with raw and filtered reads stats, and a table with raw reads stats." ; ns1:image ; ns1:input ; ns1:isBasedOn ; ns1:isPartOf , , ; ns1:keywords "ERGA, DataQC, HiFi" ; ns1:license ; ns1:name "ERGA DataQC HiFi v2601 (WF0)" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ploidy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run Smudgeplot?" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Trimmed Long Reads collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kmer length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "genome_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "max_depth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "transition_parameter" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-09-18T13:27:38Z"^^ns1:Date ; ns1:dateModified "2026-02-11T19:40:17Z"^^ns1:Date ; ns1:description "The workflow takes a (trimmed) Long reads collection, runs Meryl to create a K-mer database, Genomescope2 to estimate genome properties and Smudgeplot to estimate ploidy (optional). The main results are K-mer database and genome profiling plots, tables, and values useful for downstream analysis. Default K-mer length and ploidy for Genomescope are 31 and 2, respectively. " ; ns1:image ; ns1:input , , , ; ns1:isBasedOn ; ns1:isPartOf , , ; ns1:keywords "ERGA, Profiling" ; ns1:license ; ns1:name "ERGA Profiling Long Reads v2602 (WF1)" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "(Trimmed) Long Reads Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC F trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC R trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "The Long Reads are PacBio HiFi" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "The Long reads are ONT" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-02-11T20:03:26Z"^^ns1:Date ; ns1:dateModified "2026-02-11T20:05:30Z"^^ns1:Date ; ns1:description "The workflow takes a trimmed long reads collection, and Forward/Reverse HiC reads to run Hifiasm in HiC phasing mode. It produces both Pri/Alt and Hap1/Hap2 assemblies, and runs all the QC analysis (gfastats, BUSCO, and Merqury). The default Hifiasm purge level is aggressive (l3)." 
; ns1:image ; ns1:input , , , , , , , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "ERGA, Assembly+QC, Hi-C, HiFi" ; ns1:license ; ns1:name "ERGA Long reads+HiC Assembly+QC Hifiasm v2602 (WF2)" ; ns1:output , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-02T13:04:04Z"^^ns1:Date ; ns1:dateModified "2024-03-13T09:53:44Z"^^ns1:Date ; ns1:description "The workflow takes a trimmed HiFi reads collection, Hap1/Hap2 contigs, and the values for transition parameter and max coverage depth (calculated from WF1) to run Purge_Dups. It produces purged Hap1 and Hap2 contigs assemblies, and runs all the QC analysis (gfastats, BUSCO, and Merqury)." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.606.2" ; ns1:image ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "ERGA, Assembly+QC, HiFi" ; ns1:license ; ns1:name "ERGA HiFi Hap1Hap2 Purge+QC v2309 (WF3)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotations data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Report germline variants?" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variants to be annotated" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "annotation_metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cancer_hotspots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cgi_biomarkers" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cgi_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "civic_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "civic_variants" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "dbsnp_vcf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "final_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_germline" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_loh" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_somatic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_reports_tabular" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "germline_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loh_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loh_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "maf_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mutations_summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "somatic_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "somatic_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "uniprot_cancer_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "variant_reports_tabular" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-10T16:08:59Z"^^ns1:Date ; ns1:dateModified "2025-05-08T10:02:07Z"^^ns1:Date ; ns1:description """This Galaxy workflow takes a list of tumor/normal sample pair variants in VCF format and 1. annotates them using the ENSEMBL Variant Effect Predictor and custom annotation data 2. turns the annotated VCF into a MAF file for import into cBioPortal 3. 
generates human-readable variant- and gene-centric reports The input VCF is expected to encode somatic status, somatic p-value and germline p-value of each variant in varscan somatic format, i.e., via SS, SPV and GPV INFO keys, respectively.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.607.1" ; ns1:input , , ; ns1:keywords "EOSC4Cancer" ; ns1:license ; ns1:name "Cancer variant annotation (hg38 VEP-based)" ; ns1:output , , , , , , , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hap1 GFA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hap2 GFA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC F trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC R trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-10-11T15:06:56Z"^^ns1:Date ; ns1:dateModified "2024-03-13T09:55:11Z"^^ns1:Date ; ns1:description "The workflow takes trimmed HiC forward and reverse reads, and Hap1/Hap2 assemblies to produce Hap1 and Hap2 scaffolded assemblies using YaHS. It also runs all the QC analyses (gfastats, BUSCO, Merqury and Pretext)." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.608.1" ; ns1:image ; ns1:input , , , , , , ; ns1:isPartOf ; ns1:keywords "ERGA, Assembly+QC, Hi-C" ; ns1:license ; ns1:name "ERGA HiC Hap1Hap2 Scaffolding+QC YaHS v2309 (WF4)" ; ns1:output , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-10-07T07:43:50Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:52Z"^^ns1:Date ; ns1:description """# DNA-Seq pipeline Here we provide the tools to perform paired end or single read DNA-Seq analysis including raw data quality control, read mapping, variant calling and variant filtering. 
## Pipeline Workflow All analysis steps are illustrated in the pipeline [flowchart](https://www.draw.io/?lightbox=1&highlight=0000ff&edit=_blank&layers=1&nav=1&title=NGSpipe2go_DNAseq_pipeline.html#R7R1bd5s489f4nObBPoDvj3ESO%2BnXZpuk3Wz3pUcG2WaDgQJ2Lr%2F%2B0wgJIxAY29jGabZ7TowAIc2MRnNXrXkxfxl5yJ19dQxs1TTFeKk1L2uaprY0rQb%2FK8Zr2NLtdsOGqWca7KFVw4P5hlmjwloXpoF94cHAcazAdMVG3bFtrAdCG%2FI851l8bOJY4lddNMWphgcdWenWR9MIZqxV7fRXN66xOZ2xT%2Fc0Nr8x0p%2BmnrOw2fdsx8bhnTni3bA5%2BjNkOM%2BxpuZVrXnhOU4Q%2Fpq%2FXGALwMohFr43zLgbDdnDdlDkhaV%2Be%2FfwuPi8uHz79%2FvN9%2BFPf%2FlvXeUYWCJrwYBR0zoW6XFgmEsAr2VObXqj83sBYx14FA7RJfk1ZX%2Fpa2Mv2ULGRPvirRQawSsH%2FiyYW%2BSXSu5ZaIytQQTTC8dyPPpQc0j%2FI4%2F4gec8RVgiUBxMHDtgJKV2YNzIn2GD9Uj7ia4mpmXFOr3qwL%2BoU36HIrE5mHrIMAlwE826Mzd1cqnAIxbyffY7wq8STTKOHIavJfYC%2FBJrYsgaYWeOA%2B%2BVPMLuaj32CltTmsJQ9byi0JbCCHQWI84OfxCxVTGN%2Bl6RB%2FnBKEROLf9i21w%2BXM8nj8355L63%2FPEjqNc7Kdxhg6wjdul4wcyZOjayrlatMQQAXP5bzF3%2B%2FBS5pGX11hfHcRme%2FsNB8MpwihaBQ5piVIJfzOAf6LDRZlc%2FWffw%2B%2FIlfvHKL2wCgNhLcPmT9wcXq9foFX8vTW4hDGDimQuRNfnOwtPZU%2F3P6rBl%2FfwyuhtMHr%2BPTHXx2Ku3GatD3hQHOXBvygnFwxYKzKU4jl2Qfq18nsxue4G1nN2Pb68fhr%2FerusfHGIvHMJzAoI8BwBX75fEMnrtrsAy6n0txTLUnpZmGS1F2516rkZfR%2BdB%2F9v8WRuN0C%2F%2F4n%2Bvn%2BvdnVlGaQyC35EziKyFvmZdS3fVZj%2B9sKUPthQpNznAUn97DC6%2FK%2F%2F8%2B3l4f%2BXNVeP76PUm5DOngKwsbr7i35tx862QXBDHzValcNxW3zuSsW2cg05ALnXgvsCLoXFoWnw4aToQtoQDEEXzWESRN2zJFg9bp0AtfFOHG3Wf4vucPKC23JecHT9quNBq54PL2%2FP6A%2F4NMDZdbJlk1yTtg7g4MC4kNISjE5tDoWTHAe827TrFrBmgMXk7NkmiIOOJaZt03ycosddMpRzg%2By6Cr%2BlMElk9qPt%2BPfAWto4CGFv8ss4oWNKpEht02HUhtFRyLmGnCKQhD0%2BEDmdBAMaHc1hY2nBqBkTybHiW27Chq6E5H08c0i35eTt6AAxrU4dc1IF2LWcMEi1egplkyNHvk9%2BE8n0g%2FKFhI%2FKjwe81iDjrLF%2FTY8x8jsMaSeC%2FAVqSknel1pDlIMMHKwtMH2wqjrGwAI4Ksg1YY75POLSJrJBbI8IMsXfKE8YvWF8EdIYxpkEIDbo17SllJ598fEb%2B0EYXmR6moKAA%2BeTSWwb2QSXbBhDlLNPUJ7MVulIsJYqg9nQ6aUMJ12LjWk9k%2FCp9P21J9tMECIgy6MJPoiy6RG2EyYcyFJdKerwhMjy2pODK3dALw1BNgUyTQIy3FZZK2Pe%2BOSYlK%2FaxutYWENYVO3AmEx%2BYbAIJ0aC3x4vWO6bwy3%2F%2FFAThdcKvIPquJOEDaDicatdLs
wcSZn%2F42Ptr%2FB94AMjmAJYgUVh1pZxrToZPRC3KthTgWqG1RQnb6wFgEe51YvfIqgnqzLAF93QCPLq1rBNwv63YtmxHKibWusm2WeqpraZq4UkQ3uzJuXdcvJMJEceQlKJNvrFEnp8pKGU9toWclAL%2FfoAd9m3yBtdzlqaBC4g1Zh4RDSI56Za8KACDDibnu%2BFrdd%2FFujkhWrPwdeVT3giyGxw3aMjePCs6IRlatLWMXotxILYzxJg%2BN1mHOBo4ZJ%2BcWNRiMKFGgjj3fp6ZAX5wEWWVzx51WpQhtCS9O7IdWCa0KK2N%2BSa5jLHOTfZLmWHgoIIM%2F2BlJZmWgMYEbkqRZPqfvd7T01B%2Fsr4P6uhypGL1S11tv3dJRrDIlWq4LWyk61REromkhiHyg7sLysiLiBS%2FF4TRBTBCnWhjngNU5UxgwOiZDpuq1zBYsus%2BcdWsrN080tbhSdAIyeB%2F642wOdqg17LyZtz7mWblMvaTYO9km9eJ8vyFXl02OxlMnXoJWceKSPcl8Ps2d9Vzht9qphiV2pFwqs7mSmoRfi%2FlKmnu%2Fs58A6XykVZBPtKqHB8ZPJ4XZiKCiSxsuieMgwq6rkvWVUyoXO%2FZT%2FX1yceRMQufZb22fx4V6RvjZ7Q5j2odiEeVbDjj8QGcJ%2FW6R%2BVJubuwxDm1Xi%2BjVkqmnSlb6Pd%2FLQJ3AasJQgrJA6tFE1sxym89bykdT6EPyRiWVzQNM7SfhyZlGLfyiQkV3Kg%2BOv%2F%2BPwpuz0RU1MUAeRqrcpa7zhP6WYz%2BgSJNHVnnjOapySW5AtJKWDxsh4fxDNHctIBcr7G1xNDrQbS1bqsnrJRmP21jlqpr3BZdvsu2W7FloaRWhof9hRX41VwbS30SWw%2FPJsFiRPTQQsjVop6V8etqUVwj13KCVxdfwF1qFbm%2BOIuWzg%2FbnJjYGGGbPkTv%2Fxid5U12H1P7%2B%2B7hy1%2BX0LHueNj%2FWLRcOe92Gu1iy7YfPVn%2Bwu1VfuFazrSiq5aMrM7XrO6QNagHdJGG%2BxlGOqxivu1Vi8j3QNDtfkFy3kJeKxbeyT2o1TdDlan58bDNtZpf%2F1CaX0G3dHYod1VEaQ%2B7hCAqyn1EB9menWGQAAWKaQSRoT8z7ddfYUTQr7CZqIy%2F4EFyl8kuYXvjfm5IZB7JI5n%2BsaRTqdrTbtCnyN3GDFtE9PIb95nzlz%2Bb5yg8iNj2jYyIMKNoEcBS8Zx5TYwFgs2vUZP48b46hjkBXkLxSh9cCbb6wg9oV4jw2Vff9Gsx594qoCogiPVnZH36IfvRFBtjYNU5vrqq7K8HEiI73X6xPVfVSth0pVbajzD9TfbqTnqvlnvUDrRVFzbS3s8vF4ypFrHTfkXeE%2BWbc7KeSNfGwrVMiH81agd39Xhz8nV%2FYzNqiKuTN6M2O12Jnnl05456VL5x0i7jdlGBv3ou479DmeeKyPwbuHzawEDGD7ffahpYhqfMokW1blt3PAPZBGD0Hg70xiE4S%2BSgYVIcmKU35jBtpXaKHCbpPG42laNzGEk%2B88lH1zY3FkzSCe07iCqFHcq9Y5kVpGjvf6C9bLTnlQs4PNalRQlaO2O91GIQIh1o2xGCtm14%2FablIHZb%2F2F%2B%2BQ6Ip6%2Beex56jT3gQmijH%2Bs5EevY4lsJtz4z1WpY8Pmm2k0QXTiCgnGRxYNbkI%2FvsU727bHHSiwUlHtWb4GkA%2F2QP3c0iM6kLgfI2FQG519rkefwgHKPF5%2FT5rrVqYaoiHHS7WY6vveQcs%2B4uzBGzu0%2FLp7X7%2B77QV8fdervPha3TF2qV5jLHWu7yx32Vr6Tvacd3eMJJqgAnUyB8IM5LqrpVcOnAgGJwF4N%2FELtVVNhDklXBIvGuMeM2DRlTBZTmBxLWGKDKq%2B%2B7TaW%2B
uQs%2FPaT7TzbtXhYE2Pgm0DkwHlAzIRvOTpl%2BdEGtC7Vaat0moplx4gWtbYkq0PtS2qfqWp%2FX%2Bu%2FLVn%2FCcDtNxUm4kB%2FbiqMVBFR04j4KEu3ix7Sl7hMpJDfVQ3ZaUH2s%2BOCs6o5jJ0XKGEAMeuUIY8dz8BenTRHTJutFVriALx67ehOvNRnGDXVxC34Fz3hIsOI%2BtYK1cMRPfKUmPIjHyTB9LzMTHq%2B2ZNJVnPA8%2FhgidxhWUzcFzpUUk9Fc449pWVPPBg7xmuqMbU9BQZvCUEiBjDDNk%2BomqAujP0KneSS4Abu%2FX6yzcA7W11PTJsm10YBqmFWFKJEJ0uaCr3jMd%2B5H2DXj%2Be1rkacnkPuNpx%2BDBrTEMmB0Yb9%2FyEDg0YZtUF7uGLKjVnYv1DU7ybqnPRbaamoJ9nh%2ByVEIsiLyZ1MecdtNd%2Btisltv%2FWqqqQm7Alox9G4jxJaeGMXiSxEzwYK0Jr99UhaMM2NXSmlyqc303WxcfDUgZQ2aqN5mOJHNsMlaJ20ct0kXXxKsLGuCRI74Ph9NHctrDbu1QaFcWP6Vu1xaluMs5wN7EhB7alCGBKlVVoHY1%2FFu3j13spESacTNCDzVojKrBArG6N5jJFRwZrlZYDQDLwtyhuG8JEoTI3uVXMH4lTojRs7PGXiHlOwzYHC6A3mlxHcIR%2BZidH6KZoT0uvtvn7k9phjyoSiN6R7WHdIOdYXfgBKseLhRwwto2H09zkJGzFBLFqWkcled4xVcUvCWp6M0ELPg98P6Fnl%2BQAwkY0dq%2F34%2Bjwlx6poYpcqkwd0rMrPnsjciVMb3m6FS7%2FgKa2gGhFxRsXUceZOAhu8iCTpkRHx8yVYE6cGJhI0B8lNaW4aBuWSMqoQOWcpG4p4iIQmqaYq20%2FKSHeQEsExE93NULMcOgmxrJKO1KpLQCUQZ9IEpvLA2DjXakmos1VCHQYpdWZnc0dK201AwKnHt8iVPkfxufBprQBGbFHO1bvlMD0xmllt94ohcW8sJq3xXYapBFQEYaFfBldTZB4C6hB4zygTnbtqpy8rpSBFWgkeeWnU1e4BqO8z6koKrKJKxUcp50jEi5VyJut8Yk4XHo%2FDqaLd5aBZ2iFAoB0ea4SXmbWZ5Q8dPeH6OBBzPYzmY8jizgKX5IkKwCoh1IjAk9moj1EsnFYPyARs4m62uJwCc3Z9a3qiD820d7wnOLCEgtRGUyrSrbHhb1BHe32Dx8Ih%2FQ2%2Bm1vUe7MBEkCghQXs3GQjYND3rYUnhJeAEdcEu7DtxMVdxV%2F%2FEUwL0tFTUtgRjbxbiJK0BJQXmteqeJfsbrI2Omy%2BfHFGL4aFKIQwzU%2FUqPWEqb17BRifiBamPfXPpBXRs9W1XDEo3%2FxUgbCFhIOnyesvr%2FXwdDYXK4pYmqRQ1CRqXEK6LC26M09A%2BwjuFJ0JaTScmJDf3VzKz0ou3NS9IKWzXlFFoHccwT%2BSqFl5twHs81%2BgIF3RLC7dcV%2FpDP0nypahlh1h0rVYFCFr5Ma8%2Fbsa5qYP0ZLIxs7Cp9IgnRwVjCOXgyZ1OmRj8eR8Dm1VtN51Osf1OUhL2h25us5GPIdcfMOeGR7RkuBDqsiHtk5t3craUDSKjTs7Dx%2FGJsX97gfcVwP3O%2Be3l4v6vI2oPMxvl9ScyC%2FtsOPRt0xSLiZsSoMls6TNhWe9DjykPwE01zFp8TD71BnzGyC6OFdv9UQQttqSKvKSXK0mP4m%2B9AJpR7UMV7hAWl7ds%2FXhJkc7x9xEw7vl893NTXPYt97uH2%2FsN62EnOs9lp84cPWJfaKdR0Mem08nikm0WJ5D1YpP0KjFeMhi8doToYSeqC%2BhIPgij4V0Hd9kNqb96yyxwhPRdPKio7LJ7OQUlWRwVIsXgKtStS21QoGWlXKA5hVPWs%2Fvugfa5ja
t4AdHTvwYFeYnF2E2Y%2BxEi4UfZnKkT6k4IC9h49HpZPK5SRYST46bJKv3taXBE8fnJ39mLf%2FCBtEqM4br4odCZjOG1Ok2x%2BEL1xebF7c6UXtogjG0%2Bq0jMwZpdFz6EGBqt66tvKJZtouN4F0CQLs9MRmmIxHbZF6y1ubQLAa6E6xRwZLjh2GlhZwgiT%2BtFEUuMP7YMguVHVjtePUfSuBk%2FY64NXTSWRjSEOlma3dOJj%2FpKTsXZ328WSeZhhFbuhywsVAeALFs3aI5YMIe%2B%2FDHjYdn8n0m8Uz6WsklLWno5gYjTFBSBYJvkjuiPGFCk2yJJSRMyAlJkxBSAQFiq1CbfFL%2Bc2NtpNVctHzXN7bGzvPRdb8SjedlFvEufBh4Re3pzbaSIKnd%2FJ5Sn83u7vU4ylimzOnR3fZUJjvM6KSorN%2BLU9m%2BvTbFQgx3r2H1DspHCkfnSGlnLcVKgdssSLDCwSwHP8tg94NLKudMPlQNi52wHtpUD29CZcUCJkgXdZZV4vL69DDeENaxkBYj2KR%2FZsx9gCO7a1D1Jn6Awdq6BMIgsizByiYHg4sfOrwl%2BBcd46%2FChx5k02HJZuFCOltKtZCso%2BJmYVU7slk4u%2Fpvjm0zrTZ7dKCbVS4QrIwJJh1DBV3kg8j8ybOpa1pzSP%2BTcrx4EYNOchMUQ%2BniKdqk06sO%2FKtl5HNPPWSYBNmJZkJnUD6Sst8pFJPkrHiGDGoVUGpCzJ5SDjklaybw0BmxbKjEBKB223vab3OMSVm0k2kUom8trGSLtcqLQ8%2Bkx7D2ozR3H7xe8aReK5WSFesMsww%2FVsKMkCNBlx%2BI9R3yevg7OoEAnF20k5z38ibGjkFI%2B%2FMLjYO9nXL65bxMGvPGQ3rQF1ZYIyG11bBCT5MFEYtoaiRZUQH2QiDSKnErj2QOOygwo%2FiJh%2FEDH9KYKvgF6bTXsKbdKz%2BwHWjnug%2B78Y5WU%2BQd%2FXRsr9QWpm7hUoNQM8cJ4todmdbsqwPBb82r%2FwM%3D). In case of paired end reads, corresponding fastq files should be named using *.R1.fastq.gz* and *.R2.fastq.gz* suffixes. Specify the desired analysis details for your data in the *essential.vars.groovy* file (see below) and run the pipeline *dnaseq.pipeline.groovy* as described [here](https://gitlab.rlp.net/imbforge/NGSpipe2go/-/blob/master/README.md). A markdown file *variantreport.Rmd* will be generated in the output reports folder after running the pipeline. Subsequently, the *variantreport.Rmd* file can be converted to a final html report using the *knitr* R-package. GATK requires chromosomes in bam files to be karyotypically ordered. Best you use an ordered genome fasta file as reference for the pipeline (assigned in *essential.vars.groovy*, see below). 
### The pipeline includes - quality control of raw data with FastQC - Read mapping to the reference genome using BWA - identify and remove duplicate reads with Picard MarkDuplicates - Realign BAM files at Indel positions using GATK - Recalibrate Base Qualities in BAM files using GATK - Variant calling using GATK UnifiedGenotyper and GATK HaplotypeCaller - Calculate VQSLOD scores for further filtering variants using GATK VariantRecalibrator and ApplyRecalibration - Calculate the basic properties of variants as triplets for "all", "known", "novel" variants in comparison to dbSNP using GATK VariantEval ### Pipeline parameter settings - essential.vars.groovy: essential parameters describing the experiment including: - ESSENTIAL_PROJECT: your project folder name - ESSENTIAL_BWA_REF: path to BWA indexed reference genome - ESSENTIAL_CALL_REGION: path to bed file containing regions to limit variant calling to (optional) - ESSENTIAL_PAIRED: either paired end ("yes") or single end ("no") design - ESSENTIAL_KNOWN_VARIANTS: dbSNP from GATK resource bundle (crucial for BaseQualityRecalibration step) - ESSENTIAL_HAPMAP_VARIANTS: variants provided by the GATK bundle (essential for Variant Score Recalibration) - ESSENTIAL_OMNI_VARIANTS: variants provided by the GATK bundle (essential for Variant Score Recalibration) - ESSENTIAL_MILLS_VARIANTS: variants provided by the GATK bundle (essential for Variant Score Recalibration) - ESSENTIAL_THOUSAND_GENOMES_VARIANTS: variants provided by the GATK bundle (essential for Variant Score Recalibration) - ESSENTIAL_THREADS: number of threads for parallel tasks - additional (more specialized) parameters can be given in the var.groovy-files of the individual pipeline modules ## Programs required - Bedtools - BWA - FastQC - GATK - Picard - Samtools """ ; ns1:keywords "DNA-seq, GATK3, bpipe, groovy" ; ns1:license ; ns1:name "DNA-seq" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:name "Bpipe" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-19T23:56:34Z"^^ns1:Date ; ns1:dateModified "2023-10-19T23:56:34Z"^^ns1:Date ; ns1:description """## Summary HPPIDiscovery is a scientific workflow to augment, predict and perform an insilico curation of host-pathogen Protein-Protein Interactions (PPIs) using graph theory to build new candidate ppis and machine learning to predict and evaluate them by combining multiple PPI detection methods of proteins according to three categories: structural, based on primary aminoacid sequence and functional annotations.
HPPIDiscovery contains three main steps: (i) acquirement of pathogen and host proteins information from seed ppis provided by HPIDB search methods, (ii) Model training and generation of new candidate ppis from HPIDB seed proteins' partners, and (iii) Evaluation of new candidate ppis and results exportation. (i) The first step acquires the identification of the taxonomy ids of the host and pathogen organisms in the result files. Then it proceeds parsing and cleaning the HPIDB results and downloading the protein interactions of the found organisms from the STRING database. The string protein identifiers are also mapped using the id mapping tool of uniprot API and we retrieve the uniprot entry ids along with the functional annotations, sequence, domain and kegg enzymes. (ii) The second step builds the training dataset using the non redundant hpidb validated interactions of each genome as positive set and random string low confidence ppis from each genome as negative set. Then, PredPrin tool is executed in the training mode to obtain the model that will evaluate the new candidate PPIs. The new ppis are then generated by performing a pairwise combination of string partners of host and pathogen hpidb proteins. Finally, (iii) in the third step, the predprin tool is used in the test mode to evaluate the new ppis and generate the reports and list of positively predicted ppis. The figure below illustrates the steps of this workflow. ## Requirements: * Edit the configuration file (config.yaml) according to your own data, filling out the following fields: - base_data: location of the organism folders directory, example: /home/user/data/genomes - parameters_file: Since this workflow may perform parallel processing of multiple organisms at the same time, you must prepate a tabulated file containng the genome folder names located in base data, where the hpidb files are located. Example: /home/user/data/params.tsv. 
It must have the following columns: genome (folder name), hpidb_seed_network (the result exported by one of the search methods available in hpidb database), hpidb_search_method (the type of search used to generate the results) and target_taxon (the target taxon id). The column hpidb_source may have two values: keyword or homology. In the keyword mode, you provide a taxonomy, protein name, publication id or detection method and you save all results (mitab.zip) in the genome folder. Finally, in the homology mode allows the user to search for host pathogen ppis giving as input fasta sequences of a set of proteins of the target pathgen for enrichment (so you have to select the search for a pathogen set) and you save the zip folder results (interaction data) in the genome folder. This option is extremely useful when you are not sure that your organism has validated protein interactions, then it finds validated interactions from the closest proteins in the database. In case of using the homology mode, the identifiers of the pathogens' query fasta sequences must be a Uniprot ID. All the query protein IDs must belong to the same target organism (taxon id). - model_file: path of a previously trained model in joblib format (if you want to train from the known validated PPIs given as seeds, just put a 'None' value) ## Usage Instructions The steps below consider the creation of a sqlite database file with all he tasks events which can be used after to retrieve the execution time taken by the tasks. It is possible run locally too (see luigi's documentation to change the running command).

* Preparation: 1. ````git clone https://github.com/YasCoMa/hppidiscovery.git```` 2. ````cd hppidiscovery```` 3. ````mkdir luigi_log```` 4. ````luigid --background --logdir luigi_log```` (start luigi server) 5. conda env create -f hp_ppi_augmentation.yml 6. conda activate hp_ppi_augmentation 6.1. (execute ````pip3 install wget```` (it is not installed in the environment)) 7. run ````pwd```` command and get the full path 8. Substitute in config_example.yaml with the full path obtained in the previous step 9. Download SPRINT pre-computed similarities in https://www.csd.uwo.ca/~ilie/SPRINT/precomputed_similarities.zip and unzip it inside workflow_hpAugmentation/predprin/core/sprint/HSP/ 10. ````cd workflow_hpAugmentation/predprin/```` 11. Uncompress annotation_data.zip 12. Uncompress sequence_data.zip 13. ````cd ../../```` 14. ````cd workflow_hpAugmentation```` 15. snake -n (check the plan of jobs, it should return no errors and exceptions) 16. snakemake -j 4 (change this number according the number of genomes to analyse and the amount of cores available in your machine)""" ; ns1:image ; ns1:keywords "Bioinformatics, Protein-Protein interaction prediction, host-pathogen PPIs, proteins network augmentation" ; ns1:license ; ns1:name "HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bits for Hifiasm bloom filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database for Busco Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope Model Parameters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope Summary" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Homozygous Read Coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Stats on Alternate Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Stats on Primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Alternate Assembly contigs: Full table " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Alternate Assembly contigs: Full table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Alternate Assembly contigs: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Alternate Assembly contigs: Summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Alternate Assembly contigs: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Primary Assembly contigs: Full table " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Primary Assembly contigs: Full table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Primary Assembly contigs: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Primary Assembly contigs: Summary" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Primary Assembly contigs: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated Genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Alternate gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Primary gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury Histograms" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury completeness stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury png" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury spectra plots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nx and Size plots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Usable GFA Alternate" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Usable GFA Alternate no sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Usable GFA Primary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Usable GFA Primary no sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clean_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cutadapt multiqc stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_qv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc html report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_01.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw unitig graph image" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-15T03:01:42Z"^^ns1:Date ; ns1:dateModified "2026-03-15T03:01:44Z"^^ns1:Date ; ns1:description "Generate a genome assembly based on PacBio HiFi reads. Part of the VGP suite, it needs to be run after the VGP1 k-mer profiling workflow. The assembly contigs are built using HiFiasm, and the workflow generates assembly statistics, BUSCO reports, Merqury plots, and the contigs in fasta and GFA formats." ; ns1:input , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Assembly-Hifi-only-VGP3/main" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 26 . a ns1:Person ; ns1:name "Matthias Bernt" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Centroided LC-MS datasets" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Fasta Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Fixed modifications" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Labeled element" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Precursor monoisotopic mass tolerance (ppm)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variable modifications" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Feature fitting result" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Peptide centric result" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-04-14T02:01:47Z"^^ns1:Date ; ns1:dateModified "2026-04-14T02:01:47Z"^^ns1:Date ; ns1:description "Automated inference of stable isotope incorporation rates in proteins for functional metaproteomics " ; ns1:input , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "openms-metaprosip/main" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-10-20T11:59:54Z"^^ns1:Date ; ns1:dateModified "2025-07-24T12:30:31Z"^^ns1:Date ; ns1:description "A demonstration workflow for Reduced Order Modeling (ROM) within the eFlows4HPC project, implemented using Kratos Multiphysics, EZyRB, COMPSs, and dislib." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.614.1" ; ns1:image ; ns1:keywords "Reduced Order Modeling, PyCOMPSs, Kratos Multiphysics, EZyRB, dislib, Supercomputer, data_persistence, Nord3, COMPSs Use Cases" ; ns1:license ; ns1:name "eFlows4HPC Demo ROM Workflow" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Gene name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum number of Gaussians to study" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum value in logNorm" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Tabular with raw expression values" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "baredsc_neff" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "baredsc_numpy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "baredsc_qc_plots" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_other_outputs" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_pdf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "combined_plot" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-03-26T09:52:15Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:36Z"^^ns1:Date ; ns1:description "Run baredSC in 1 dimension in logNorm for 1 to N gaussians and combine models." ; ns1:input , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "baredsc/baredSC-1d-logNorm" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-21T23:35:16Z"^^ns1:Date ; ns1:dateModified "2023-10-21T23:37:59Z"^^ns1:Date ; ns1:description """## Summary PredPrIn is a scientific workflow to predict Protein-Protein Interactions (PPIs) using machine learning to combine multiple PPI detection methods of proteins according to three categories: structural, based on primary aminoacid sequence and functional annotations.
PredPrIn contains three main steps: (i) acquirement and treatment of protein information, (ii) feature generation, and (iii) classification and analysis. (i) The first step builds a knowledge base with the available annotations of proteins and reuses this base for other prediction experiments, saving time and becoming more efficient. (ii) The feature generation step involves several evidence from different classes, such as: Gene Ontology (GO) information, domain interaction, metabolic pathway participation and sequence-based interaction. For the GO branches, we made a study to evaluate the best method to calculate semantic similarity to enhance the workflow performance. This step can be easily modified by adding new metrics, making PredPrIn flexible for future improvements. Finally, (iii) in the third step, the adaboost classifier is responsible for predicting the final scores from the numerical features dataset, exporting results of performance evaluation metrics. ## Requirements: * Python packages needed: - pip3 install luigi - pip3 install sqlalchemy - pip3 install rdflib - pip3 install sklearn - pip3 install matplotlib - pip3 install numpy * Other instalation: - sqlite (to be able to see the documentation generated by luigi about the tasks after execution) ## Usage Instructions The steps below consider the creation of a sqlite database file with all he tasks events which can be used after to retrieve the execution time taken by the tasks. It is possible run locally too (see luigi's documentation to change the running command).

* Preparation: 1. ````git clone https://github.com/YasCoMa/predprin.git```` 2. ````cd PredPrIn```` 3. `pip3 install -r requirements.txt` 4. Download annotation_data.zip (https://drive.google.com/file/d/1bWPSyULaooj7GTrDf6QBY3ZyeyH5MRpm/view?usp=share_link) 5. Download rdf_data.zip (https://drive.google.com/file/d/1Cp511ioXiw2PiOHdkxa4XsZnxOeM3Pan/view?usp=share_link) 6. Download sequence_data.zip (https://drive.google.com/file/d/1uEKh5EF9X_6fgZ9cTTp0jW3XaL48stxA/view?usp=share_link) 7. Unzip annotation_data.zip 8. Unzip rdf_data.zip 9. Unzip sequence_data.zip 10. Download SPRINT pre-computed similarities in https://www.csd.uwo.ca/~ilie/SPRINT/precomputed_similarities.zip and unzip it inside core/sprint/HSP/ 11. Certify that there is a file named client.cfg (to configure the history log and feed the sqlite database). It must have the following data: ```` [core] default-scheduler-host=localhost default-scheduler-port=8082 rpc-connect-timeout=60.0 rpc-retry-attempts=10 rpc-retry-wait=60 [scheduler] record_task_history = True [task_history] db_connection = sqlite:///luigi-task-hist.db ```` * Parameters: 1. parameters-file -> json file with all the information to process the prediction experiment (example: params.json) 2. mode -> it can have two values: train (executes cross validation and save the model as a .joblib file) or test (uses a model obtained in train mode to test in some dataset listed in the parameters file) 3. model -> it is the model file full path saved in train mode as .joblib * Running: 1. ````mkdir luigi_log```` (or other name for the log folder of your choice) 2. ````luigid --background --logdir luigi_log```` (start luigi server) 3. ````nohup python3.5 -m luigi --module main RunPPIExperiment --parameters-file params.json --mode 'train' --model none.joblib --workers 3 &````
````nohup python3.5 -m luigi --module main RunPPIExperiment --parameters-file params.json --mode 'test' --model model.joblib --workers 3 &````
- Replace python3.5 with the python command of your environment
- Replace the data given as example in params.json using your own data
- Adapt the number of workers to use as you need and the capacity of your computational resource available You can monitor the prediction experiment execution in localhost:8082 ## Reference Martins YC, Ziviani A, Nicolás MF, de Vasconcelos AT. Large-Scale Protein Interactions Prediction by Multiple Evidence Analysis Associated With an In-Silico Curation Strategy. Frontiers in Bioinformatics. 2021:38. https://www.frontiersin.org/articles/10.3389/fbinf.2021.731345/full ## Bug Report Please, use the [Issues](https://github.com/YasCoMa/PredPrIn/issues) tab to report any bug.""" ; ns1:image ; ns1:keywords "Bioinformatics, Luigi & Rufus workflow, Pathway co-occurrence, Gene ontology term sets similarity, Domain-Domain interaction" ; ns1:license ; ns1:name "PredPrIn - Scientific workflow to predict protein-protein interactions based in a combined analysis of multiple protein characteristics." ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-21T23:43:48Z"^^ns1:Date ; ns1:dateModified "2023-10-21T23:45:17Z"^^ns1:Date ; ns1:description """## Summary The validation process proposed has two pipelines for filtering PPIs predicted by some _IN SILICO_ detection method, both pipelines can be executed separately. The first pipeline (i) filter according to association rules of cellular locations extracted from HINT database. The second pipeline (ii) filter according to scientific papers where both proteins in the PPIs appear in interaction context in the sentences. The pipeline (i) starts extracting cellular component annotations from HINT PPIs building a dataset and then the Apriori algorithm is applied in this dataset in an iterative process that repeat the application of this algorithm till the rules cover 15 main locations in the cell. 
This process generates a database with association rules with two main columns: antecedent and consequent, meaning that a location that occurs in antecedent also occurs with the location in consequent. The filtering task evaluates the PPI, checking if some location annotated for the first protein is in the antecedent column and if some location of the second protein is also in the same rule but in the consequent column. If so, the PPI passes according to the criteria. The pipeline (ii) starts getting all papers that mention both proteins in the PPIs and extracting their content using the NCBI [API](https://www.ncbi.nlm.nih.gov/home/develop/api/). These XML files are cleaned removing hypertext markup and references to figures, tables and supplementary materials. The paragraphs of the remaining articles content are processed by Natural language processing steps to extract sentences, tokens, stopwords removal to remove words extremely common in the English language that do not help to identify the context of interest, prioritizing tokens using part-of-speech tagging to keep just nouns and verbs. Then the filtered sentences go to the task that identifies the proteins of the PPI in evaluation among the tokens and also tries to identify tokens or set of tokens that mention experimental methods. The sentences that have the proteins of interest are filtered if the nouns and verbs have some of the items of the list of words indicating interaction relation (recruit, bind, interact, signaling, etc). Finally, a report is made per pair with the article identifiers, the sentences, the proteins and interacting words found. The figure below illustrates all the tasks of these pipelines.
pipeline
## Requirements: * Python packages needed: - pip3 install pandas - pip3 install rdflib - pip3 install mlxtend - pip3 install inflect - pip3 install nltk - pip3 install biopython - pip3 install lxml - pip3 install bs4 (beautiful soup) ## Usage Instructions ### Preparation: 1. ````git clone https://github.com/YasCoMa/ppi_validation_process.git```` 2. `pip3 install -r requirements.txt` 3. ````cd ppi_validation_process/pipe_location_assocRules/```` 4. ````unzip pygosemsim.zip```` 5. ````cd ../```` ### Filtering by association rules of cellular locations (first filtering part) - File ````pipe_location_assocRules/find_pattern.py```` : * Pipeline parameters: - __-fo__ or __--folder__
Folder to store the files (use the folder where the other required file can be found) - __-if__ or __--interactome_file__
File with the pairs (two columns with uniprot identifiers in tsv format)
Example of this file: pipe_location_assocRules/running_example/all_pairs.tsv * Running modes examples: 1. Go to the first filtering part folder:
````cd pipe_location_assocRules/```` 2. Uncompress annotation_data.zip 3. Run:
````python3 find_pattern.py -fo running_example/ -if all_pairs.tsv```` ### Filtering by text mining on scientific papers (second filtering part) - File ````ppi_pubminer/pubmed_pmc_literature_pipeline.py````: * Pipeline parameters: - __-em__ or __--execution_mode__
Use to indicate the execution mode desired:
1 - Mode using a list of protein pairs as bait
2 - Mode that tries to find sentences of PPI context for any protein pairs given a list of articles - __-fo__ or __--folder__
Folder to store the files (use the folder where the other required file can be found) - __-rtm1__ or __--running_type_mode_1__
Use to indicate which execution step you want to run for mode 1 (it is desirable following the order showed):
0 (default) - Run all steps
1 - Run step 1 (Get mentions of both proteins in PMC articles)
2 - Run step 2 (Get the PMC or Pubmed files, clean and store them)
3 - Run step 3 (Get the exact sentences where the proteins were found on interacting context) - __-rtm2__ or __--running_type_mode_2__
Use to indicate which execution step you want to run for mode 2 (it is desirable following the order showed):
0 (default) - Run all steps
1 - Run step 1 (Get the PMC or Pubmed files from the given list, clean and store them)
2 - Run step 2 (Get the exact sentences where the proteins were found on an interacting context) - __-fp__ or __--file_pairs__
(For mode 1) File with the pairs (two columns with uniprot identifiers in tsv format)
Example of this file: ppipubminer/running_example/mode_1/all_pairs.tsv - __-fe__ or __--file_evaluation__
(For mode 1) File exported after step 1 execution in tsv format
- __-fa__ or __--file_articles__
(For mode 2) File with the articles (First column indicating if it is from pmc or pubmed and the second one is the article id) in tsv format)
Example of this file: ppipubminer/running_example/mode_2/articles_info.tsv * Running modes examples: - Go to the second filtering part folder:
````cd ppipubminer/```` - Mode 1 - From protein pairs (PPIs) to sentences in articles 1. Running all three steps of mode 1:
````python3 pubmed_pmc_literature_pipeline.py -em 1 -rtm1 0 -fo running_example/mode_1/ -fp all_pairs.tsv```` 2. Running only step 1 of mode 1:
````python3 pubmed_pmc_literature_pipeline.py -em 1 -rtm1 1 -fo running_example/mode_1/ -fp all_pairs.tsv```` 3. Running only step 2 of mode 1:
````python3 pubmed_pmc_literature_pipeline.py -em 1 -rtm1 2 -fo running_example/mode_1/ -fp all_pairs.tsv -fe literature_evaluation_pairs.tsv```` 4. Running only step 3 of mode 1:
````python3 pubmed_pmc_literature_pipeline.py -em 1 -rtm1 3 -fo running_example/mode_1/ -fp all_pairs.tsv -fe literature_evaluation_pairs.tsv```` - Mode 2 - From articles to report of sentences with any protein pairs (PPIs) 1. Running all three steps of mode 2:
````python3 pubmed_pmc_literature_pipeline.py -em 2 -rtm1 0 -fo running_example/mode_2/ -fa articles_info.tsv```` 2. Running only step 1 of mode 2:
````python3 pubmed_pmc_literature_pipeline.py -em 2 -rtm1 1 -fo running_example/mode_2/ -fa articles_info.tsv```` 3. Running only step 2 of mode 2:
````python3 pubmed_pmc_literature_pipeline.py -em 2 -rtm1 2 -fo running_example/mode_2/ -fa articles_info.tsv ```` ## Reference Martins YC, Ziviani A, Nicolás MF, de Vasconcelos AT. Large-Scale Protein Interactions Prediction by Multiple Evidence Analysis Associated With an In-Silico Curation Strategy. Frontiers in Bioinformatics. 2021:38. https://www.frontiersin.org/articles/10.3389/fbinf.2021.731345/full ## Bug Report Please, use the [Issues](https://github.com/YasCoMa/ppi_validation_process/issues) tab to report any bug.""" ; ns1:image ; ns1:keywords "Bioinformatics, scientific publication text mining, validaiton o protein interaction predictions" ; ns1:license ; ns1:name "PPIVPro - PPI Validation Process" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-21T23:56:39Z"^^ns1:Date ; ns1:dateModified "2023-10-21T23:56:39Z"^^ns1:Date ; ns1:description """## Summary This pipeline has as major goal provide a tool for protein interactions (PPI) prediction data formalization and standardization using the [OntoPPI](https://link.springer.com/chapter/10.1007/978-3-030-36599-8_23) ontology. This pipeline is splitted in two parts: (i) a part to prepare data from three main sources of PPI data ([HINT](http://hint.yulab.org/), [STRING](https://string-db.org/) and [PredPrin](https://github.com/YasCoMa/PredPrin.git)) and create the standard files to be processed by the next part; (ii) the second part uses the data prepared before to semantically describe using ontologies related to the concepts of this domain. 
It describes the provenance information of PPI prediction experiments, datasets characteristics, functional annotations of proteins involved in the PPIs, description of the PPI detection methods (also named as evidence) used in the experiment, and the prediction score obtained by each PPI detection method for the PPIs. This pipeline also executes data fusion to map the same protein pairs from different data sources and, finally, it creates a database of all this information in the [AllegroGraph](https://allegrograph.com/) triplestore. ## Requirements: * Python packages needed: - pip3 install numpy - pip3 install rdflib - pip3 install uuid - pip3 install SPARQLWrapper - AllegroGraph tools (pip3 install agraph-python)
Go to this [site](https://franz.com/agraph/support/documentation/current/python/install.html) for the installation tutorial ## Usage Instructions ### Preparation: 1. ````git clone https://github.com/YasCoMa/ppintegrator.git```` 2. ````cd ppintegrator```` 3. `pip3 install -r requirements.txt` **Allegrograph is a triple store, which is a database to maintain semantic descriptions. This database's server provides a web application with a user interface to run, edit and manage queries, visualize results and manipulate the data without writing codes other than SPARQL query language. The use of the Allegregraph option is not mandatory, but if you want to export and use it, you have to install the server and the client.** 4. if you want to use the Allegrograph server option (this triple store has free license up to 5,000,000 triples), install allegrograph server in your machine (configure a user and password): Server - https://franz.com/agraph/support/documentation/current/server-installation.html; Client - https://franz.com/agraph/support/documentation/current/python/install.html 5. Export the following environment variables to configure Allegrograph server ```` export AGRAPH_HOST=127.0.0.1 export AGRAPH_PORT=10035 export AGRAPH_USER=chosen_user export AGRAPH_PASSWORD=chosen_password ```` 5. Start allegrograph: ````path/to/allegrograph/bin/agraph-control --config path/to/allegrograph/lib/agraph.cfg start```` 6. Read the file data_requirements.txt to understand which files are needed for the process ### Data preparation (first part) - File ````prepare_data_triplification.py```` : * Pipeline parameters: - __-rt__ or __--running_type__
Use to indicate from which source you want to prepare PPI data, as follows:
1 - Prepare data for PredPrin
2 - Prepare data for String
3 - Prepare data for HINT - __-fec__ or __--file_experiment_config__
File with the experiment configuration in json format
Examples are in these files (all the metadata are required): params_hint.json, params_predrep_5k.json and params_string.json - __-org__ or __--organism__
Prepare data only for one organism of interest (example: homo_sapiens)
This parameter is optional. If you do not specify, it will automatically use the organisms described in the experiment configuration file above * Running modes examples: 1. Running for PPI data generated by PredPrin:
````python3 prepare_data_triplification.py -rt 1 -fec params_predrep_5k.json```` 2. Running for HINT database:
````python3 prepare_data_triplification.py -rt 3 -fec params_hint.json```` 3. Running for STRING database:
````python3 prepare_data_triplification.py -rt 2 -fec params_string.json```` In the file ````auxiliar_data_preparation.py```` you can run it for all the examples provided automatically, as follows:
````python3 auxiliar_data_preparation.py```` ### PPI data triplification (second part) - File ````triplification_ppi_data.py````: * Pipeline parameters: - __-rt__ or __--running_type__
Use to indicate which execution step you want to run (it is advisable to follow the order shown):
0 - Generate the descriptions for all the protein interaction steps of an experiment (run steps 1, 2 and 3)
1 - Generate triples just about data provenance
2 - Generate triples just for protein functional annotations
3 - Generate triples just for the score results of each evidence
4 - Execute data fusion
5 - Generate descriptions and execute data fusion (run steps 1, 2, 3 and 4)
6 - Export to allegrograph server - __-fec__ or __--file_experiment_config__
File with the experiment configuration in json format
Examples are in these files (all the metadata are required): params_hint.json, params_predrep_5k.json and params_string.json - __-fev__ or __--file_evidence_info__
File with the PPI detection methods information in json format
Examples are in these files (all the metadata are required): evidences_information.json, evidences_information_hint.json and evidences_information_string.json - __-fcv__ or __--file_config_evidence__
File with the experiment and evidence methods files addresses in tsv format
Example of this file: config_evidence_file.tsv * Running modes examples: 1. Running to generate all semantic descriptions for PredPrin:
````python3 triplification_ppi_data.py -rt 0 -fec params_predrep_5k.json -fev evidences_information.json```` 2. Running to generate only triples of data provenance:
````python3 triplification_ppi_data.py -rt 1 -fec params_hint.json -fev evidences_information_hint.json```` 3. Running to generate only triples of PPI scores for each evidence:
````python3 triplification_ppi_data.py -rt 3 -fec params_hint.json -fev evidences_information_hint.json```` 4. Running to generate only triples of protein functional annotations (only PredPrin exports these annotations):
````python3 triplification_ppi_data.py -rt 2 -fec params_predrep_5k.json -fev evidences_information.json```` 5. Running to generate all semantic descriptions for STRING:
````python3 triplification_ppi_data.py -rt 0 -fec params_string.json -fev evidences_information_string.json```` **For the next options (4, 5 and 6), it is mandatory running at least mode 1 and 3 for HINT, STRING and PredPrin** 6. Running to execute data fusion of different sources:
````python3 triplification_ppi_data.py -rt 4 -fcv config_evidence_file.tsv```` 7. Running to generate all semantic descriptions and execute data fusion of different sources (combines mode 0 and 4):
````python3 triplification_ppi_data.py -rt 5 -fcv config_evidence_file.tsv```` 8. Export semantic data to allegrograph server:
````python3 triplification_ppi_data.py -rt 6 -fcv config_evidence_file.tsv```` ## Query Scenarios for analysis Supposing you ran all the steps shown in the section above, you can run the following options to analyse the data stored in the AllegroGraph triple store.
File to use for this section: ````query_analysis_ppitriplificator.py````
* Parameter: - __-q__ or __--query_option__
Use to indicate which query you want to perform:
1 - Get all the different organisms whose interactions are stored in the database
2 - Get the interactions that have scientific papers associated and the list of these papers
3 - Get a list of the most frequent biological processes annotated for the interactions of Escherichia coli bacteria
4 - Get only the interactions belonging to a specific biological process (regulation of transcription, DNA-templated) in Escherichia coli bacteria
5 - Get the scores of interactions belonging to a specific biological process (regulation of transcription, DNA-templated) in Escherichia coli bacteria
6 - Get a list of the most frequent biological processes annotated for the interactions of human organism
7 - Get only the interactions belonging to a specific biological process (positive regulation of transcription by RNA polymerase II) in human organism
8 - Get the scores of interactions belonging to a specific biological process (positive regulation of transcription by RNA polymerase II) in human organism * Running modes examples: 1. Running queries:
````python3 query_analysis_ppitriplificator.py -q 1 ````
Change number 1 to the respective number of the query you want to perform ## Reference Martins, Y. C., Ziviani, A., Cerqueira e Costa, M. D. O., Cavalcanti, M. C. R., Nicolás, M. F., & de Vasconcelos, A. T. R. (2023). PPIntegrator: semantic integrative system for protein–protein interaction and application for host–pathogen datasets. Bioinformatics Advances, 3(1), vbad067. ## Bug Report Please, use the [Issues](https://github.com/YasCoMa/ppintegrator/issues) tab to report any bug.""" ; ns1:image ; ns1:keywords "protein interactin data triplification, protein interactions database integration, data fusion, data annotation" ; ns1:license ; ns1:name "PPIntegrator - PPI Triplification Process" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-22T00:02:52Z"^^ns1:Date ; ns1:dateModified "2023-10-22T00:02:52Z"^^ns1:Date ; ns1:description """## Summary The PPI information aggregation pipeline starts getting all the datasets in [GEO](https://www.ncbi.nlm.nih.gov/geo/) database whose material was generated using expression profiling by high throughput sequencing. From each database identifiers, it extracts the supplementary files that had the counts table. Once finishing the download step, it identifies those that were normalized or had the raw counts to normalize. It also identify and map the gene ids to uniprot (the ids found usually were from HGNC and Ensembl). For each normalized counts table belonging to some experiment, il filters those which have the proteins (already mapped from HGNC to Uniprot identifiers) in the pairs in evaluation. Then, it calculates the correlation matrix based on Pearson method in the tables and saves the respective pairs correlation value for each table. 
Finally, a repor is made for each pair in descending order of correlation value with the experiment identifiers. ## Requirements: * Python packages needed: - os - scipy - pandas - sklearn - Bio python - numpy ## Usage Instructions * Preparation: 1. ````git clone https://github.com/YasCoMa/PipeAggregationInfo.git```` 2. ````cd PipeAggregationInfo```` 3. ````pip3 install -r requirements.txt```` ### Preprocessing pipeline * Go to the ncbi [GDS database webpage](https://www.ncbi.nlm.nih.gov/gds), use the key words to filter your gds datasets of interest and save the results as file ("Send to" option), and choose "Summary (text)" * Alternatively, we already saved the results concerning protein interactions, you may use them to run preprocessing in order to obtain the necessary files for the main pipeline * Running preprocessing: - ````cd preprocessing```` - ````python3 data_preprocessing.py ./workdir_preprocessing filter_files```` - ````cd ../```` - Copy the generated output folder "data_matrices_count" into the workflow folder: ````cp -R preprocessing/workdir_preprocessing/data_matrices_count .```` ### Main pipeline * Pipeline parameters: - __-rt__ or __--running_type__
Use to indicate the step you want to execute (it is advisable to follow the order):
1 - Make the process of finding the experiments and ranking them by correlation
2 - Select pairs that were already processed and ranked, making a separate folder of interest - __-fo__ or __--folder__
Folder to store the files (use the folder where the other required file can be found) - __-if__ or __--interactome_file__
File with the pairs (two columns with uniprot identifiers in tsv format)
Example of this file: running_example/all_pairs.tsv - __-spf__ or __--selected_pairs_file__
File with PPIs of interest (two columns with uniprot identifiers in tsv format)
Example of this file: running_example/selected_pairs.tsv * Running modes examples: 1. Run step 1:
````python3 pipeline_expression_pattern.py -rt 1 -fo running_example/ -if all_pairs.tsv ```` 2. Run step 2:
````python3 pipeline_expression_pattern.py -rt 2 -fo running_example/ -spf selected_pairs.tsv ```` ## Bug Report Please, use the [Issue](https://github.com/YasCoMa/PipeAggregationInfo/issues) tab to report any bug.""" ; ns1:image ; ns1:keywords "Bioinformatics, gene expression correlation, gene expression data wrangling, geo database mining" ; ns1:license ; ns1:name "PipePatExp - Pipeline to aggregate gene expression correlation information for PPI" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-10-07T07:46:11Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:52Z"^^ns1:Date ; ns1:description """# scRNA-Seq pipelines Here we forge the tools to analyze single cell RNA-Seq experiments. The analysis workflow is based on the Bioconductor packages [*scater*](https://bioconductor.org/packages/devel/bioc/vignettes/scater/inst/doc/overview.html) and [*scran*](https://bioconductor.org/packages/devel/bioc/vignettes/scran/inst/doc/scran.html) as well as the Bioconductor workflows by Lun ATL, McCarthy DJ, & Marioni JC [*A step-by-step workflow for low-level analysis of single-cell RNA-seq data.*](http://doi.org/10.12688/f1000research.9501.1) F1000Res. 2016 Aug 31 [revised 2016 Oct 31];5:2122 and Amezquita RA, Lun ATL et al. [*Orchestrating Single-Cell Analysis with Bioconductor*](https://osca.bioconductor.org/index.html) Nat Methods. 2020 Feb;17(2):137-145. ## Implemented protocols - MARS-Seq (massively parallel single-cell RNA-sequencing): The protocol is based on the publications of Jaitin DA, et al. (2014). *Massively parallel single-cell RNA-seq for marker-free decomposition of tissues into cell types.* Science (New York, N.Y.), 343(6172), 776–779. https://doi.org/10.1126/science.1247651 and Keren-Shaul H., et al. (2019). 
*MARS-seq2.0: an experimental and analytical pipeline for indexed sorting combined with single-cell RNA sequencing.* Nature Protocols. https://doi.org/10.1038/s41596-019-0164-4. The MARS-Seq library preparation protocol is given [here](https://github.com/imbforge/NGSpipe2go/blob/master/resources/MARS-Seq_protocol_Step-by-Step_MML.pdf). The sequencing reads are demultiplexed according to the respective pool barcodes before they are used as input for the analysis pipeline. - Smart-seq2: Libraries are generated using the [Smart-seq2 kit](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2639.html). ## Pipeline Workflow All analysis steps are illustrated in the pipeline [flowchart](https://www.draw.io/?lightbox=1&highlight=0000ff&edit=_blank&layers=1&nav=1&title=scRNA-Seq#R7R3ZcpvK8mtUlTxIxSIh6dF2oiyVODm2Uzk5LykEI4kYAWGxrXz9nZ6FdUBIQoDjm%2BTeIwYYZqZ7eu%2BegXq1fXrn697ms2sie6BI5tNAfTNQFFmdTPB%2FoGVHW6bKjDasfctkDyUNt9YfxBol1hpZJgoyD4aua4eWl200XMdBRphp033ffcw%2BtnLt7Fc9fY0KDbeGbhdbv1tmuGGtsjZPbrxH1nrDPj1TpvTGUjfu174bOex7jusgemer827YHIONbrqPqSb17UC98l03pL%2B2T1fIhmXlK0bfW5TcjYfsIyes88J76eNqcz0L7YfNzfL6%2Fe3i55%2F3Qw6AB92O2FoMFM3GHV6a1gOsrm2tHXJD%2Bx3BUC99sgzxJf61Zv8lry39fAseEumLt5LFCHd87Tfh1sa%2FZHzP1pfIvoyX9Mq1XZ88pC7IH%2FxIEPrufQwkvIiXK9cJGUbJGoxbDzbIZD2SfuKrlWXbqU4R%2BRN3yu8QGKqXhru1DHwp4d9rWw8C9juGo0T6D%2FXQcmGFhnMpnl4aKgxQD8gP0VOqiUHpHXK3KPR3%2BBF2dzbR6CtsMw1nDAsfU6g5ZW2bFFaOJZVtCbYd1nHfCV7gHww1xGjyYFz%2Fc%2Fs9%2Bhi9%2BfPf3Ye7xY%2Fg4b%2Bh%2FKLx5K0Gf8vwZO3rpoWhfQz6NIAtykzKYEtMU1PYMpYE2KLxB0%2FBlj%2Ffwzd30r%2F%2FfVzcvPW3snn3bvdhOJELwEMmprfs0vXDjbt2Hd1%2Bm7SmIAALkzzzyXU9BpZfKAx3DIR6FLq4KYUU6MkK%2F4XXRxN29SN1580T65lc7PiFg%2Bebegkuf6TvJa%2BRq%2BQ98wLYDr40ALYAaWhcWDYfThEBMwhXCvnAjXwDVexFtsdD3V%2BjsOo59iAsfSUi%2BcjGNOwhywpFSEFexfPWd6kHPNdywiDV81doSPBzPJtk8VMZZ9lU7vnJbFb1PP5BR5CgZzyVWhj75e7L%2Fa%2FN7Ofvf%2B9CZfHz%2BrusLYez54KwRaw6PyKNZ00j0km8SN3DiRIgctYDlH8YEDBc4AdkzXsq50vwcK1exsJerpTBxeVWtxwAiuUh28LcADdeptkc%2FcY%2Bflh7JJXzCTzdybfpwAp8tMr0uglDkLYvADrKYm2FmMuOfNsbORg7lIW1X
a5cjCn45%2FW7W5iasnbxxRD%2Fb2m7S%2BDe6AH0ggWfN6YKi8C4ub4I0G%2F4udX9EP8c8fsjzL7dh1167JLBWGXSiFF%2BGPqRY%2Bgh3m2FeVb0Gq%2FrIlkAKfUvhocuEEVE61YCurzg0gU2VvbSDbwxYILGwV3e6UnQbh6sYvLQCJEZEmJshfoSv50iM1j%2FRivLsYhCokivlnDndaXA%2Fezmbbu6GRBeE3jIINwGa%2BEwUdC5XTOyAQ2f%2F%2FwQxnIntHRgh7pjxpMbwrStFRY4CdPXMU9F%2FlETbgAHn5ARhbDeKRwsGUm5MncGHUgTqEBc20mrQPK4ARVILDMJhJTcEmBF0IOfWFH0sMoIk6cCI5fuZrwhtjyNhctVKSXVXkO5sGSKYMV422E6Q0HIHypZpWCa7cBdrQJgQjkgHCboiw0Zky4lff77R0bq36eaDtKKaaKnlqimTWoIslJX12xJQ%2FgWIP%2FL8heYgDGpBCtQPQ1g6T4BRbOcNaVpS9c3kT%2FEzSAaEEmBIT6heBLG8kl8J23jvSCihIrG8Dd%2BwtNNM%2B5b6Z8kRuEXjMKnjEHucOkr29FJEpeAI3GBpgi%2FcuDkmRXaZmeIUQdL8gaBTqpDqfBUDMPUU0o5IMOla%2B4KjQURKzRjPUXfejYaVAljycPF13mLxRsMSu14t5KDRYFUX1at%2FqGxOOaKWSx133DN5qcR9yu98tHvyPIB%2BzBwcf%2FS54ub2%2BEtIHL57GoO5OD5eq5rNz5Z2qn04U3ldF93Ml9MnxuELuuOTdWJtkvkdzIvsNU2PivSqfTKxry%2Fo2kZTWInnxbuNNo6HQOMEO7m5iZhfWkhk%2F%2BDn9IrA%2FMNn2zDR%2FhSJ3MEocJrbo6sOwn8EWtni5ywi0mNRqPmpqQ7IDW64QZ0XMnUQ%2F1cU4JGEUsn0hMRS9KabLWOoaSEX6aUpPQNUMksQ7cvmMc0BCXikvtPbbSCz7j4qZVNHEwr4lNKqxOPGytEt1iqgR4ffd2Lx3WaFq1kdDJ5pglUQpFOKM8PFuTxZUqWP0CBUzp11Tw3BY6D9NkocJ5Q%2FN%2Fi4VsOlfAlL9HZaPuQ7CC4p6XuYaQPh2xXXRAZ36GWsrJ4BN7wNTGnigxsy1pUx8u3bQpPHTVVSh7g5kysZl6mNK9ulM3Ycjl60P2gEaN%2FWZcnKaAFEJ0HIHkhy3cfLKLopC28IkSrZG%2BXsUH4Gr%2BYWRZmoyj9Ln0N5iq0JEuvXC8cHS4PpZZzL4dUqjnkqbyQKfyfyOuNxdlkueNYntbjjvIRQVnHc0eRc7xVuzP%2FYG8Nz%2BMsGM9geBZGmMjqy5Nbmgp4qh3xpPVEiollhIUehP9cEZJcR4D4HWHSF%2B6IDs7VU3cFA9YfybCJbxAGi%2Fnu%2FYB50nL8fBMtR3hvV%2FByxse3eHBA8Rex2xQeA88dvvHbGNHmmNvupeoUTqVUXUR3cpQ%2BS7hVrYS%2Bk0hO1rGURfgGCP1EygWgjYuUXtYEJEo73JlYh9ALQyy5ylVCTpC9dB9boCRKNSk5H0kY1yQJE6lpknBkEGTWQT3WpBxWnBbUKMSReDVfNo40DPpiwKqcg62a82ZTJGVvJUA%2FFYfU6bRRHBKKLT1yl%2B%2FBozKxRe1MbKlLo8Z9CdTO0ygljV%2F78XE8OT8%2Bys8mtaDRSO3aqNSWBFyAlqb52q%2FVxtve6xvVfHKt%2B2t1qPWHehzKhFoAljrvxb6fqrl9n%2BUr%2B59vOEHj9uHi8Z186%2F78cjn%2F8k19%2BrrZ3g97pD3%2FvXxo2gt8PJUPTWYn8aHauv3t3cVNbc3%2BBqvuxGjseRBkdE4dPsm3wBA%2FXIkfpyBzTiW%2B4QjguZxFmo6VdvHaMmk6ZZ0lfhMa3CR9TfwpOVoHvqzUQjNXVtYQAguZdy1vLdMkpDA
Hk6Tlhq2EOijJbiUJzsQBTskU2714WJNL%2FA%2Bv3hUw1wke5hW%2BlpNr%2FA8e98Mr18Fd6xYBLsII%2B4iCcFBMyG0ADdQZfDxrvSnggdBMr44PxoN6MtALlVi1ukyncQPNacmFWmGL1ncPkkQJ5iSUjnBFf4lCLwK2A%2FUrYINfFJgOvvnbqGI53fmeY7%2BiG0%2FDIhyJxTUm467jPzw9YCZNXnj%2B%2F0LfWjbg%2B3tkPyDoddBGZM2Uc6MKk7KQKh3BnWri%2BbRneC4VUN1HQWSHQT%2BR3cAEOo3i4DS3gELbOwJWDFVFirYWVLGh6WpE%2FjPR0Iw8G%2BMdPF2dmtdqfMKL35Iaz3XZuyUPD3aruSVnvd%2BStrvu6X7EIxvyvWi4to0l81RIPdIN2I%2BcQ%2FULwc%2BAzBO1Rf4ilnqVLqXerkx%2FXJjdK%2FXOO7PTVo67N6SnKPX6CFTJnlKfVsMumUixiFdkEWwsZ%2FczMHxHD9Dvn%2FQGVu5%2FhiTRaBEYI9o2utmap0Vi5rsqDbDMByv2eXHII%2FjGaINsD%2FnB6ObkRRJ2WViYfsWqfsXjxUQv3mywJX0wQqYrTBAmO0qNOI4O%2Feya1gpoFsEM8mAiChtREEJXmr4FRuosAyrHJZGnwLlpoYMQ40ewwcQgoLROkRyETJRGN1F2Yy%2BYeVvS6nxWj8HLSgMcXlzk7fREjF%2FR1uPPM3NjazGOcf029uGkeNtZnTFn8vUXnXS84h93zk%2FqBaU06B%2BxnBXy0ROmfhbJRavrKjGBGmwptQG7MaEJPFadRkeSmEjbWvo6Q%2BSz%2B1JykznYrTJuKzbyvLGQqtKtW%2BX%2B92P4aG23MlI%2Baab0dbwcXg6nz0XVODayOkN7yvSVHEURAL5xIqNpWa5UiHyj5LBAZIrUapyteKtMzxNCN82hMx9w6bjyocCnhdzVJp1BtASDoYJl1nDnoeAo2klMlKQ2E0nrwaQMZKudh9qhmPk5HE4yJ38FyRxLPQwf597O%2Fjslj63Q26hZp7Yzs%2FGQ45OMcMqzib%2FrQ650bSirPYmT4gSDG31Z2fjSOKnc89q4VWb2k7rMDgyYIi%2BRiCmw3mMNIGXVT9xrZ%2BRn6fq1dCKHMzLtWTKyfEjVRCnWg2iTkYnjwV9gOYimTBHTmvRuJveC3k3lrEljMt8jvOeeH8%2FboXdLfXsFxkRy5ExNamdgwkJqLy2t9Xdr3UaSJx6mwYZ5OEWbPkuKlhfNJ1q3ormQolUndv5fMhceTZCmYcJVbUswr00lLkzz2%2BcPl7Ts4R3esBLkiv%2FOBl6IiEa6Lr1uEh8J7mjAnSdJIUXmu7kh6wMfIGndBLN1k5gEiBn1Rq6MAcqWwW9H0MLTwrIdm0noknEfTqNm7dOo3LlBqwn8HRSPCML3luTvIB91je9o5A%2FsY%2Fh%2BCNtm0lQNk5xIpwhKfHVuzv3rJbr%2BmnOztVEm8tHmXDWLZi2Zc%2FmA%2B2bOXSMHXbrmDouFSm2R0OcKMBMlMbl%2BIFQbeoMPImcNrvU2bLnwSSiOiAejHEyIJ9KgdUJ8BmFxPOtWWBRb%2BP5631ej0mLtzNx%2BWvg0tZ8WPiysHWbcw9hHKtXS0DAeS8%2FKGxUE2cCzrTA8d9pkSv7k8zlc5uwgeTInJVaJmmXiabkIegaZU5O6lTkrTVy9CYAtxt6Hvm7c9zT%2BlVqtYMpskMxC%2F2AFUNEsoEfk9j7VpAFszwfqjQUmprNF4gstHz2qnNB%2FCWFWW0JoWkA4KX6fD%2Fso8nX2gso3CGLnHAN4%2BTvkuJlzSiqkhO7pGpVPSKEHDAG8JZ6I%2B3GdmUQ%2B0n2F9DDyc6fxkW%2Bsw1XbMd2FKO2kDjAL0LZdmowYkCkCTlcVWx4cWQ64weq%2BTcgkSoZIT%2BW60dRSA1RavIO
LpSEKC3fewr0xDXm5hXvFKnaPHDIxzzzIISO3yEDngiAacYJgvzLg5uV1H3p%2FplwSaRMnZxFMqlYVXtr5a3RJssUpBuT8LbxT2flb1GOWy3HjKUz3mJXDWUj8GvN2Urs%2FrlNA%2FWk6QTZRJWdq19DxW7vACsg6Iy84%2FbScc53k8%2F%2BBnX5Q0HM492fOSxrE4lCxZpI8E7D22fR0aix0AT7%2F7PH97rs6x6%2Fy02rrqaNd8c54mJ1Yzz44dbLH9cfc4WE9sp7xuIy4lswfy%2FOQ%2BbpqZK3oio6%2BpaG3mIc9gE5IjpWnpj1PZ4dWIibC1M3NbVPXdV1bHt3IIxpBsv7T51EqR4zyWZtKc0q4ILq4ZR2c1%2BPtsRMA6odmEuF7RMaW%2BjYhYi%2FC1J9NWlTnNUtNnq%2FoTqdZXU3Y%2BqdtGvu5Y7KDGpWNhANMp4eFA0xnsxyKnSnhB2qf3LACLHVDAhi0QNag1mkiZWDKc2%2B6j84gqUkCE2oh14fVb4E5HBwGME%2Bjx%2FOJeMp76qfjolm8TU%2F9e%2BnjanM9C%2B2Hzc3y%2Bv3t4uef98NyR30BuYAPCPlc3mY1FrGzT2gNom2CvLS7AjtblrIzVrU5BSRhheV0VAZrKhR6Li3sLMKKLEVvhNNlCYlAVBvLFab%2FUxidEAmKpxwuYX8PUodbxgU5oa8VFUxsolTV8KoctxubWOlZdgvynZXegWPBSvPakY2v9CmOZS0vEqcsvnyFk0ay1iJtKVMxKlWJKgFY56LvkfLuIba%2FMyCXzGsE7cMudrBP89jVptOzEr1frs9TuCzP0BnGbO8L6tap2I4vze9VuRgv1ovT24F16F5qgMXMtWzmz0SQM8EjJDMcRjkTh%2BmyzL1FPRMLN2fa62Wg2wsw2OWRUyz%2FiLBTOxN2HlrxHSTfIR8ZQVHKkyp8DHV6EXfyIcSANNIR54k3grwWBSQfjaF5sUhrAZfLhIfCgDTxiN7QxEli%2F0E0XtGEoxSAd4sCLkh8RTXrL4yxEt9P1%2BjZnhBvnP3K%2FMb1rT94fnr8%2FVjGUJvZJAUlYVzcJMpsXtwkZ7Nq%2F%2FX5wEWbdrow7rratpC2cv%2BY2x%2FvrA9vl7e3T%2B7O3X388eXbsK6Rm5sQuq7ykjtdcpo9VXbv87ymx7mN3OyQCWKQ%2BATHYtQu9eJ6OwKP4B4%2BQU7UoPUZYjmBNXJx4YwW760VgA6iO8iN4JrNiljTYru3IrR8lyPbszN851N9px2XbBSegNrxMXIHEUJ88ZVUSibW64aPRT2eFAqSe6pIZvuhyULAF21izxPwchbwyplcvXUhX%2BkSbh%2F0wjjIjqu0ngv0x0L%2B0KDKuqggXPu2QiqrBlnLMB759u4S8m5hOvu4ZYId9Cqkqbnqm%2BH8wD1Xn72Op2ruUNaxoO6Qohb5qzptwPMgrhspdbu1zn4qa1UVtP26QGd1jIXD7lMd4x6f%2Fn4ayM9zMMnBMU6T3LklMyWNPnufV7STDn%2BvZ5GY9Acdz3K2XV1EElNWpReIlLcLqPuqxeafnyinIFJtO8JVFOqm7oW17Qe8JV2Y0Udb9wGsoKQrYpVluWyspk7ccRflHA02xao4unKUe3bWhHwYndrHKrN8l%2FY41j3EysS2r8Hu8eCklxf2nq1woyoyl%2B67C3wXBazkVjm1CoQeX8ZBJ6nCVwvyR8g508ut5eWzrFqVq7P1VoO%2FgxJnUb5gF2vGVM0yGBtfswMoCUvf6CbBAimrvzUU6jmf5tigKAJxKnD%2FqHzJGwcuZ8yl0UgHxBKStyI732LHns1%2F8m5EcoCS7dL8bohDxzwtQo5BE%2BNYkU6bnj5asfvtQiZX0nIHtIT0d0Xoj%2BfRzmm%2BePbIeILT%2BFcQZl2zVf0X5pSfkkGhwGbBvvvPFSVrmIKT4Rw5uWvX3%2BL
P%2F9Ez2YPx2X0SCaGq19UHE3IQV8lKsIR681cUhOSIQZqCXzhjFE97BRuULuuD7lsQSUNSLS3HsEfJEhg7g0SCeRs9QK9rjusNySyV0BNeVJ%2BPjRUJ%2BHp1QcEaDm%2Bv38Zj%2Fvb54msChVpfubKjgMOC%2Bossuh67ZPhBtBx6rhfZvKLPIeu6o8zvPi4ym9Rx46tDh1Cz2wvHAfKE9g7vYJx6Y61IRSkoUkQX3kdBwHEiLq9AyjFAkMKQFGIi3JqNBAi%2Fd%2BgAcGORdOTDCnJc56%2FJAZhP8vpRkTEImb7aRBaA0ELbIzX8SKtQI2e%2FdFApfDzOWl20ec5OW79SeNYw3FalcD7gvlUK%2FxzZoYU5b%2B2suWiLSTZhoTHDRu0diFUzSW6vf6VV5b5A82rtmLJqtjOBfNz1CQpyp7ECXdb8PNRDKFw%2BRtt6U3SlapD9rPj5NZ2l5aysdeQXJNIeGXNo3U7%2BBZDlNz5aZT5QIKGYio982xs5kIFTSkaHCSk1qXa1oAsC7fDYiF4mVTZTySFM608aMdYPQz9yIOTVLE5D3CFfW73Oep9%2FdbMG6HZXHKsJ%2BhY0v0aWW9Bb1VqLJp4vHdv2gpDjMxtZjFxPPUC6OAK%2BpIHqhMC5Xf%2FedokdZ4u1xzXRf1PjFemB5d0e0YCVV8KSggO%2B%2B6q5AeKF0LHYCQyHjYCD3I78TFYbCJgklxor9oMk4F8K9n8EUBQr6sjAYoOftdW5BmjxKQDWmtfr2JYiuptUOY4TwuPtGr%2BIBcGAFT6mDelSyLSWY2JMoAiOf3x2A2poChHYG%2B4RAvxNFjFA5KyKpK5jyUxa3QZnTEw2fde746ISDK48Nf40yVvJCN4TVZSKK6oXNW0ge0csZosM07mVPCl9ec96lcuFpYv43NOX8SVmMWH6cYxgm8%2BuCSr12%2F8B). Specify desired analysis details for your data in the respective *essential.vars.groovy* file (see below) and run the selected pipeline *marsseq.pipeline.groovy* or *smartsseq.pipeline.groovy* as described [here](https://gitlab.rlp.net/imbforge/NGSpipe2go/-/blob/master/README.md). The analysis allows further parameter fine-tuning subsequent the initial analysis e.g. for plotting and QC thresholding. Therefore, a customisable *sc.report.Rmd* file will be generated in the output reports folder after running the pipeline. Go through the steps and modify the default settings where appropriate. Subsequently, the *sc.report.Rmd* file can be converted to a final html report using the *knitr* R-package. 
### The pipelines includes: - FastQC, MultiQC and other tools for rawdata quality control - Adapter trimming with Cutadapt - Mapping to the genome using STAR - generation of bigWig tracks for visualisation of alignment - Quantification with featureCounts (Subread) and UMI-tools (if UMIs are used for deduplication) - Downstream analysis in R using a pre-designed markdown report file (*sc.report.Rmd*). Modify this file to fit your custom parameter and thresholds and render it to your final html report. The Rmd file uses, among others, the following tools and methods: - QC: the [scater](http://bioconductor.org/packages/release/bioc/html/scater.html) package. - Normalization: the [scran](http://bioconductor.org/packages/release/bioc/html/scran.html) package. - Differential expression analysis: the [scde](http://bioconductor.org/packages/release/bioc/html/scde.html) package. - Trajectory analysis (pseudotime): the [monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html) package. ### Pipeline parameter settings - essential.vars.groovy: essential parameter describing the experiment - project folder name - reference genome - experiment design - adapter sequence, etc. - additional (more specialized) parameter can be given in the var.groovy-files of the individual pipeline modules - targets.txt: comma-separated txt-file giving information about the analysed samples. The following columns are required - sample: sample identifier. Must be a unique substring of the input sample file name (e.g. common prefixes and suffixes may be removed). These names are grebbed against the count file names to merge targets.txt to the count data. - plate: plate ID (number) - row: plate row (letter) - col: late column (number) - cells: 0c/1c/10c (control wells) - group: default variable for cell grouping (e.g. by condition) for pool-based libraries like MARSseq required additionally: - pool: the pool ID comprises all cells from 1 library pool (i.e. 
a set of unique cell barcodes; the cell barcodes are re-used in other pools). Must be a unique substring of the input sample file name. For pool-based design, the pool ID is grebbed against the respective count data filename instead of the sample name as stated above. - barcode: cell barcodes used as cell identifier in the count files. After merging the count data with targets.txt, the barcodes are replaced with sample IDs given in the sample column (i.e. here, sample names need not be a substring of input sample file name). ### Programs required - FastQC - STAR - Samtools - Bedtools - Subread - Picard - UCSC utilities - RSeQC - UMI-tools - R ## Resources - QC: the [scater](http://bioconductor.org/packages/release/bioc/html/scater.html) package. - Normalization: the [scran](http://bioconductor.org/packages/release/bioc/html/scran.html) package. - Trajectory analysis (pseudotime): the [monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html) package. - A [tutorial](https://scrnaseq-course.cog.sanger.ac.uk/website/index.html) from Hemberg lab - Luecken and Theis 2019 [Current best practices in single‐cell RNA‐seq analysis: a tutorial](https://www.embopress.org/doi/10.15252/msb.20188746) """ ; ns1:keywords "scRNA-seq, MARS-seq, bpipe, groovy" ; ns1:license ; ns1:name "scRNA-seq MARS-seq" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Bpipe" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-22T00:11:33Z"^^ns1:Date ; ns1:dateModified "2023-10-22T00:11:33Z"^^ns1:Date ; ns1:description """## Summary This pipeline contains the following functions: (1) Data processing to handle the tansformations needed to obtain the original pathway scores of the samples according to single sample analysis GSEA (2) Model training based on the disease and healthy sample pathway scores, to classify them (3) Scoring matrix weights optimization according to a gold standard list of drugs (those that went on clinical trials or are approved for the disease).It tests the weights in a range of 0 to 30 (you may change as you want). The evaluation function tests and try to maximize the number of approved drugs whose modified pathway scores for disease samples is changed from disease to healthy sample classification, according to the trained model. (4) Computation of the calibrated disease samples pathwa scores according to the interaction among drug and targets found in the sample pathways & Drug ranking based on the disease samples whose calibrated matrix were responsible to change the trained model decision from disease to healthy state. (5) Drug combination ranking evaluated the same way as in option (4) but adding the effects of multiple drugs in each sample while calculating the calibrated scoring matrix ## Input configuration file: * The pipeline only needs a configuration file and the step number you want to run. 
- Configuration file keys (see also the example in config.json): - **identifier**: project identifier to be used in the result files - **type_normalization**: normalization type (possible values: tpm, fpkm, tmm, cpm or fpkm_uq) - **genome_assembly**: the supported assemblies are the 37 and 38 (values may be: g37 or g38) - **pathway_geneset**: pathway-based gene sets, choose one identifier from the list in [genesets_available.txt](https://github.com/YasCoMa/caliscoma_pipeline/blob/master/genesets_available.txt) - **folder**: working directory - **expression_file**: compressed gene expression file for the desired icgc project, it must be separated by tabulation. The following columns are mandatory: submitted_file_id (sample names), raw_read_count (the read counts without normalization) and gene_id (genes in ensembl or hgnc symbol). File expected to be in {folder}. - **labels_file** (optional for function 1): file with two columns, one named 'sample' corresponding to the unique values of submitted_sample_id; the second named 'label' corresponding to a disease (or confirmed tumour) (1) or a healthy (0) case. File expected to be in {folder}. - **trained_model** (optional for function 1): file with the trained model to separate healthy and disease cases. Full path is expected. - **means_table_file** (optional for function 1): file with the means table calculated when the model is trained by the function 3. Full path is expected. - **samples_pathway_scores** (optional for function 1): file with the original model calculated pathway scores by function 1, in order to check the number of features expected by the original model. Full path is expected. - **optimized_weights_file**: tab separated table file with two columns representing the weights (w1, w2, w3) and their respective values. 
- **drug_list_file** (only mandatory for function 3): file with the gold standard drug list (one drugbank id per line), this file is expected to be in the in the experiment item folder results ({folder}/{identifier}) - **drug_combination_file** (only mandatory for function 5): file with the drug combination candidates list (drugbank ids concatenated with comma in each line). Full path is expected. - Observation: * The "labels_file" parameter is mandatory for the weights optimization, scoring matrix calculation, model traning and drug (or drug combination) ranking * In case of transfer learning, "labels_file" may be ignored only if both "trained_model", "means_table_file" and "samples_pathway_scores" are present. This is only possible for the functions 2, 4 and 5. For weights optimization, only labels file is accepted. * If type_normalization and/or genome_assembly are missing or empty, it will switch to the default fpkm_uq * If pathway_geneset is missing or empty, it will switch to the default KEGG_2021_HUMAN * If optimized_weights_file is missing or empty, it will switch to the default values (w1: 20, w2: 5, w3: 10) ## Usage Instructions ### Preparation: 1. ````git clone https://github.com/YasCoMa/caliscoma_pipeline.git```` 2. ````cd caliscoma_pipeline```` 3. Create conda environment to handle dependencies: ````conda env create -f drugresponse_env.yml```` 4. ````conda activate drugresponse_env```` 5. Setup an environment variable named "path_workflow" with the full path to this workflow folder ### Getting data for the running example in the LICA-FR and LIRI-JP projects from ICGC 1. Download the [expression file for LICA-FR](https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LICA-FR/exp_seq.LICA-FR.tsv.gz) and put it in data_icgc folder 2. Download the [expression file for LIRI-JP](https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LIRI-JP/exp_seq.LIRI-JP.tsv.gz) and put it in data_icgc folder 3. 
For the liri-jp project, the labels file is already processed, to given an example of a project that run all steps proposed by this workflow ### Run analysis - Run all steps: ````python3 main.py -rt 0 -cf config.json```` - Run all steps: ````python3 main.py -rt 0 -cf config_transfer_options.json```` - Run only data processing: ````python3 main.py -rt 1 -cf config.json```` - Run only data processing: ````python3 main.py -rt 1 -cf config_transfer_options.json```` - Run only model training & modified pathway score matrix: ````python3 main.py -rt 2 -cf config.json```` - Run only model training & modified pathway score matrix: ````python3 main.py -rt 2 -cf config_transfer_options.json```` - Run only weights optimization: ````python3 main.py -rt 3 -cf config.json```` - Run only drug ranking: ````python3 main.py -rt 4 -cf config.json```` - Run only drug ranking: ````python3 main.py -rt 4 -cf config_transfer_options.json```` - Run only drug combination evaluation: ````python3 main.py -rt 5 -cf config.json```` - Run only drug combination evaluation: ````python3 main.py -rt 5 -cf config_transfer_options.json```` ## Reference Martins, Y. C. (2023). Multi-task analysis of gene expression data on cancer public datasets. medRxiv, 2023-09. ## Bug Report Please, use the [Issues](https://github.com/YasCoMa/caliscoma_pipeline/issues) tab to report any bug.""" ; ns1:keywords "Workflows, durg response simulation, gene set enrichment analysis, personalized medicine, data retrieval and transformation" ; ns1:license ; ns1:name "DReCaS - Pipeline for drug ranking based on computed pathway scores of disease and healthy samples" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-22T00:18:50Z"^^ns1:Date ; ns1:dateModified "2023-10-22T00:19:15Z"^^ns1:Date ; ns1:description """## Summary The data preparation pipeline contains tasks for two distinct scenarios: [leukaemia](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE425) that contains microarray data for 119 patients and [ovarian](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE140082) cancer that contains next generation sequencing data for 380 patients. The disease outcome prediction pipeline offers two strategies for this task: **Graph kernel method**: It starts generating personalized networks for each patient using the interactome file provided and generate the patient network checking if each PPI of the interactome has both proteins up regulated or down regulated according to the gene expression table provided. The first step generate a set of graphs for the patients that are evaluated with 4 distinct kernels for graph classification, which are: Linear kernel between edge histograms, Linear kernel between vertex histograms and the Weisfeiler lehman. These kernels functions calculate a similarity matrix for the graphs and then this matrix is used by the support vector machine classifier. Then the predictions are delivered to the last task that exports a report with the accuracy reached by each kernel. It allows some customizations about the network parameters to be used, such as the DEG cutoff to determine up and down regulated based on the log2 fold change, which will determine the topology and the labels distribution in the specific sample graphs. It is also possible customize the type of node/edge attributes passed to the kernel function, which may be only label, only weight or both. **GSEA based pathway scores method**: This method is faster and do not rely on tensor inputs such as the previous method. 
It uses geneset enrichment analysis on the pathways from KEGG 2021 of Human, and uses the scores of the pathways found enriched for the samples to build the numerical features matrix, that is then delivered to the AdaBoost classifier. The user may choose balance the dataset using oversampling strategy provided by SMOTE. ## Usage Instructions ### Preparation: 1. ````git clone https://github.com/YasCoMa/screendop.git```` 2. ````cd screendop```` 3. Decompress screening_ovarian/raw_expression_table.tsv.tar.xz 4. Create conda environment to handle dependencies: ````conda env create -f drugresponse_env.yml```` 5. ````conda activate drugresponse_env```` 6. Setup an environment variable named "path_workflow_screendop" with the full path to this workflow folder ### Data preparation - File ````data_preparation_for_pipeline.py```` : #### Files decompression - Decompress data_preparation/lekaemia.tar.xz - Decompress data_preparation/ovarian/GSE140082_data.tar.xz - Put the decompressed file GSE140082_series_matrix.txt in data_preparation/ovarian/ #### Pipeline parameters - __-rt__ or __--running_type__
Use to prepare data for the desired scenario:
1 - Run with Leukaemia data
2 - Run with Ovarian cancer data #### Running modes examples 1. Run for Leukaemia data:
````python3 data_preparation_for_pipeline.py -rt 1 ```` In this case, you must have [R](https://www.r-project.org/) installed and also the library [limma](https://bioconductor.org/packages/release/bioc/html/limma.html), it is used to determine DEGs from microarray data. For this dataset, the files are already prepared in the folder. 2. Run for Ovarian cancer data:
````python3 data_preparation_for_pipeline.py -rt 2 ```` In this case, you must have [R](https://www.r-project.org/) installed and also the library [DESeq](https://bioconductor.org/packages/release/bioc/html/DESeq.html), because this scenario treats next generation sequencing data ### Disease outcome prediction execution - File ````main.py````: #### Pipeline parameters - __-rt__ or __--running_step__
Use to choose the desired disease outcome prediction method:
1 - Run graph kernel method
2 - Run gsea based pathway scores method - __-cf__ or __--configuration_file__
Configuration file with the execution parameters in json format
Example of this file: config.json #### Input configuration file - Configuration file keys (see also the example in config.json): - **folder** (mandatory for both methods): working directory - **identifier**: project identifier to be used in the result files - **mask_expression_table** (mandatory for both methods): Gene expression values file with the result of the fold change normalized value of a certain gene for each sample, already pruned by the significance (p-value). - **raw_expression_table** (mandatory for both methods): Raw gene expression values already normalized following the method pf preference of the user. - **labels_file** (mandatory for both methods): File with the prognosis label for each sample - **deg_cutoff_up**: Cutoff value to determine up regulated gene. Default value is 1. - **deg_cutoff_down**: Cutoff value to determine down regulated gene. Default value is -1. - **nodes_enrichment**: Node attributes to be used in the screening evaluation. It may be a list combining the options "label", "weight" or "all". Examples: ["all", "weight"], ["label"], ["label", "weight"]. Default value is ["all"]. - **edges_enrichment**: Edge attributes to be used in the screening evaluation. It may be a list combining the options "label", "weight" or "all". Examples: ["all", "weight"], ["label"], ["label", "weight"]. Default value is ["all"]. - **flag_balance**: Flag to indicate whether the user wants to balance the samples in each outcome class, by SMOTE oversampling. Values may be false or true. Default value is false. #### Running modes examples 1. Running disease outcome prediction by graph kernel method:
````python3 main.py -rt 1 -cf config.json```` 2. Running disease outcome prediction by gsea enriched network method:
````python3 main.py -rt 2 -cf config.json```` ## Reference Martins, Y. C. (2023). Multi-task analysis of gene expression data on cancer public datasets. medRxiv, 2023-09. ## Bug Report Please, use the [Issue](https://github.com/YasCoMa/screendop/issues) tab to report any bug.""" ; ns1:keywords "Bioinformatics, personalized medicine, gene set enrichment analysis, disease outcome prediction, public cancer datasets exploration, data wrangling, data transformation" ; ns1:license ; ns1:name "ScreenDOP - Screening of strategies for disease outcome prediction" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Column number with SRA ID" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Column number with final identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SRA_manifest" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/paired_output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/single_output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-04-11T02:02:36Z"^^ns1:Date ; ns1:dateModified "2026-04-11T02:02:36Z"^^ns1:Date ; ns1:description "This workflow takes as input a SRA_manifest from SRA Run Selector and will generate one fastq file or fastq pair of file for each experiment (concatenated multiple runs if necessary). Output will be relabelled to match the column specified by the user." ; ns1:input , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "sra-manifest-to-concatenated-fastqs/main" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 12 . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Igor Makunin and Mike Thang for help with the workflow" . 
a ns1:Person ; ns1:name "The workflow is based on the Galaxy Training tutorial Analyses of metagenomics data. Thank you to the Galaxy Australia team" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , ; ns1:dateCreated "2023-10-26T19:34:08Z"^^ns1:Date ; ns1:dateModified "2024-04-05T05:20:07Z"^^ns1:Date ; ns1:description """The aim of this workflow is to handle the routine part of shotgun metagenomics data processing on Galaxy Australia. The workflow is using the tools MetaPhlAn2 for taxonomy classification and HUMAnN2 for functional profiling of the metagenomes. The workflow is based on the Galaxy Training tutorial 'Analyses of metagenomics data - The global picture' (Saskia Hiltemann, Bérénice Batut) https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/general-tutorial/tutorial.html#shotgun-metagenomics-data. The how-to guide is available here: https://vmurigneu.github.io/shotgun_howto_ga_workflows/ """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.624.1" ; ns1:image ; ns1:keywords "Metagenomics, GUCFG2galaxy, shotgun" ; ns1:license ; ns1:name "Analyses of shotgun metagenomics data with MetaPhlAn2" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-04-02T02:01:43Z"^^ns1:Date ; ns1:dateModified "2026-04-02T02:01:44Z"^^ns1:Date ; ns1:description "This workflow performs the scaffolding of a genome assembly using HiC data with YAHS. Can be used on any assembly with Hi-C data, and the assembly in the gfa format." 
; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Scaffolding-HiC-VGP8/main" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 31 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-27T08:14:19Z"^^ns1:Date ; ns1:dateModified "2023-10-27T12:05:57Z"^^ns1:Date ; ns1:description """# MMV Im2Im Transformation [![Build Status](https://github.com/MMV-Lab/mmv_im2im/workflows/Build%20Main/badge.svg)](https://github.com/MMV-Lab/mmv_im2im/actions) A generic python package for deep learning based image-to-image transformation in biomedical applications The main branch will be further developed in order to be able to use the latest state of the art techniques and methods in the future. To reproduce the results of our manuscript, we refer to the branch [paper_version](https://github.com/MMV-Lab/mmv_im2im/tree/paper_version). (We are actively working on the documentation and tutorials. Submit a feature request if there is anything you need.) --- ## Overview The overall package is designed with a generic image-to-image transformation framework, which could be directly used for semantic segmentation, instance segmentation, image restoration, image generation, labelfree prediction, staining transformation, etc.. The implementation takes advantage of the state-of-the-art ML engineering techniques for users to focus on researches without worrying about the engineering details. In our pre-print [arxiv link](https://arxiv.org/abs/2209.02498), we demonstrated the effectiveness of *MMV_Im2Im* in more than ten different biomedical problems/datasets. 
* For computational biomedical researchers (e.g., AI algorithm development or bioimage analysis workflow development), we hope this package could serve as the starting point for their specific problems, since the image-to-image "boilerplates" can be easily extended for further development or adapted for users' specific problems. * For experimental biomedical researchers, we hope this work provides a comprehensive view of the image-to-image transformation concept through diversified examples and use cases, so that deep learning based image-to-image transformation could be integrated into the assay development process and permit new biomedical studies that can hardly be done only with traditional experimental methods ## Installation Before starting, we recommend to [create a new conda environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) or [a virtual environment](https://docs.python.org/3/library/venv.html) with Python 3.9+. Please note that the proper setup of hardware is beyond the scope of this package. This package was tested with GPU/CPU on Linux/Windows and CPU on MacOS. [Special note for MacOS users: Directly pip install in MacOS may need [additional setup of xcode](https://developer.apple.com/forums/thread/673827).] ### Install MONAI To reproduce our results, we need to install MONAI's code version of a specific commit. To do this: ``` git clone https://github.com/Project-MONAI/MONAI.git cd ./MONAI git checkout 37b58fcec48f3ec1f84d7cabe9c7ad08a93882c0 pip install . ``` We will remove this step for the main branch in the future to ensure a simplified installation of our tool. ### Install MMV_Im2Im for basic usage: (For users only using this package, not planning to change any code or make any extension): **Option 1: core functionality only** `pip install mmv_im2im`
**Option 2: advanced functionality (core + logger)** `pip install mmv_im2im[advance]`
**Option 3: to reproduce paper:** `pip install mmv_im2im[paper]`
**Option 4: install everything:** `pip install mmv_im2im[all]`
For MacOS users, additional ' ' marks are needed when using installation tags in zsh. For example, `pip install mmv_im2im[paper]` should be `pip install mmv_im2im'[paper]'` in MacOS. ### Install MMV_Im2Im for customization or extension: ``` git clone https://github.com/MMV-Lab/mmv_im2im.git cd mmv_im2im pip install -e .[all] ``` Note: The `-e` option is the so-called "editable" mode. This will allow code changes taking effect immediately. The installation tags, `advance`, `paper`, `all`, are to be selected based on your needs. ### (Optional) Install using Docker It is also possible to use our package through [docker](https://www.docker.com/). The installation tutorial is [here](docker/tutorial.md). ### (Optional) Use MMV_Im2Im with Google Colab We provide a web-based demo, if cloud computing is preferred. You can [![Open a 2D labelfree DEMO in Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MMV-Lab/mmv_im2im/blob/main/tutorials/colab/labelfree_2d.ipynb). The same demo can be adapted for different applications. ## Quick start You can try out on a simple example following [the quick start guide](tutorials/quick_start.md) Basically, you can specify your training configuration in a yaml file and run training with `run_im2im --config /path/to/train_config.yaml`. Then, you can specify the inference configuration in another yaml file and run inference with `run_im2im --config /path/to/inference_config.yaml`. You can also run the inference as a function with the provided API. This will be useful if you want to run the inference within another python script or workflow. 
Here is an example: ``` from pathlib import Path from aicsimageio import AICSImage from aicsimageio.writers import OmeTiffWriter from mmv_im2im.configs.config_base import ProgramConfig, parse_adaptor, configuration_validation from mmv_im2im import ProjectTester # load the inference configuration cfg = parse_adaptor(config_class=ProgramConfig, config="./paper_configs/semantic_seg_2d_inference.yaml") cfg = configuration_validation(cfg) # define the executor for inference executor = ProjectTester(cfg) executor.setup_model() executor.setup_data_processing() # get the data, run inference, and save the result fn = Path("./data/img_00_IM.tiff") img = AICSImage(fn).get_image_data("YX", Z=0, C=0, T=0) # or using delayed loading if the data is large # img = AICSImage(fn).get_image_dask_data("YX", Z=0, C=0, T=0) seg = executor.process_one_image(img) OmeTiffWriter.save(seg, "output.tiff", dim_orders="YX") ``` ## Tutorials, examples, demonstrations and documentations The overall package aims to achieve both simplicty and flexibilty with the modularized image-to-image boilerplates. To help different users to best use this package, we provide documentations from four different aspects: * [Examples (i.e., scripts and config files)](tutorials/example_by_use_case.md) for reproducing all the experiments in our [pre-print](https://arxiv.org/abs/2209.02498) * A bottom-up tutorials on [how to understand the modularized image-to-image boilerplates](tutorials/how_to_understand_boilerplates.md) (for extending or adapting the package) and [how to understand the configuration system in details](tutorials/how_to_understand_config.md) (for advance usage to make specific customization). * A top-down tutorials as [FAQ](tutorials/FAQ.md), which will continuously grow as we receive more questions. 
* All the models used in the manuscript and sample data can be found here: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10034416.svg)](https://doi.org/10.5281/zenodo.10034416) ### Contribute models to [BioImage Model Zoo](https://bioimage.io/#/) We highly appreciate the BioImage Model Zoo's initiative to provide a comprehensive collection of pre-trained models for a wide range of applications. To make MMV_Im2Im trained models available as well, the first step involves extracting the state_dict from the PyTorch Lightning checkpoint. This can be done via: ```python import torch ckpt_path = "./lightning_logs/version_0/checkpoints/last.ckpt" checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu')) state_dict = checkpoint['state_dict'] torch.save(state_dict, "./state_dict.pt") ``` All further steps to provide models can be found in the [official documentation](https://bioimage.io/docs/#/contribute_models/README). ## Development See [CONTRIBUTING.md](CONTRIBUTING.md) for information related to developing the code. **MIT license** """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.626.1" ; ns1:keywords "Machine Learning, Python, image processing, Electron microscopy, imaging, jupyter" ; ns1:license ; ns1:name "MMV_Im2Im" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-10-27T13:22:34Z"^^ns1:Date ; ns1:dateModified "2023-11-24T08:43:59Z"^^ns1:Date ; ns1:description """**Name:** Matrix multiplication with Objects **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Matrix multiplication is a binary operation that takes a pair of matrices and produces another matrix. If A is an n×m matrix and B is an m×p matrix, the result AB of their multiplication is an n×p matrix defined only if the number of columns m in A is equal to the number of rows m in B. When multiplying A and B, the elements of the rows in A are multiplied with corresponding columns in B. In this implementation, A and B are square matrices (same number of rows and columns), and so it is the result matrix C. Each matrix is divided in N blocks of M doubles. The multiplication of two blocks is done by a multiply task method with a simple three-nested-loop implementation. When executed with COMPSs, the main program generates N^3^ tasks arranged as N^2^ chains of N tasks in the dependency graph. 
# Execution instructions Usage: ``` runcompss --lang=python src/matmul_objects.py numberOfBlocks blockSize ``` where: * numberOfBlocks: Number of blocks inside each matrix * blockSize: Size of each block # Execution Examples ``` runcompss --lang=python src/matmul_objects.py 16 4 runcompss src/matmul_objects.py 16 4 python -m pycompss src/matmul_objects.py 16 4 ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.627.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Tutorial, Example, Laptop, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Matrix Multiplication with Objects (inputs generated by the code)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Estimated normal tissue purity" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Estimated tumor tissue purity" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ID of matched normal sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NORMAL sample forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NORMAL sample reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Patient sex" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Regions of interest" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Sample ID (tumor tissue)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/TUMOR sample forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/TUMOR sample reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/VarScan Output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/analysis_metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_html_normal" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_html_tumor" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_normal_genome_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/bamqc_tumor_genome_results" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_normal_fw_after" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_normal_fw_before" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_normal_rv_after" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_normal_rv_before" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_tumor_fw_after" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_tumor_fw_before" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_tumor_rv_after" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/fastqc_html_tumor_rv_before" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/multiqc_html_post_trim" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/multiqc_html_pre_trim" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-27T13:48:44Z"^^ns1:Date ; ns1:dateModified "2025-05-08T10:01:22Z"^^ns1:Date ; ns1:description """Call somatic, germline and LoH event variants from PE Illumina sequencing data obtained from matched pairs of tumor and normal tissue samples. This workflow can be used with whole-genome and whole-exome sequencing data as input. For WES data, parts of the analysis can be restricted to the exome capture kits target regions by providing the optional "Regions of Interest" bed dataset. 
The current version uses bwa-mem for read mapping and varscan somatic for variant calling and somatic status classification.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.628.1" ; ns1:input , , , , , , , , , ; ns1:keywords "EOSC4Cancer" ; ns1:license ; ns1:name "Variant calling from matched tumor/normal sample pair (hg38 version)" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotations data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Export folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Report germline variants?" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sample metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Study ID" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variants to be annotated" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "annotation_metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cgi_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "civic_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "final_variants" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_germline" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_loh" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_cards_somatic" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gene_reports_tabular" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "germline_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loh_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "loh_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "maf_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mutations_summary" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "somatic_cancer_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "somatic_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "uniprot_cancer_genes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "variant_reports_tabular" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-10-27T15:07:03Z"^^ns1:Date ; ns1:dateModified "2025-05-08T10:00:45Z"^^ns1:Date ; ns1:description """A variation of the Cancer variant annotation (hg38 VEP-based) workflow at https://doi.org/10.48546/workflowhub.workflow.607.1. Like that other workflow it takes a list of tumor/normal sample pair variants in VCF format (see the other workflow for details about the expected format) and 1. annotates them using the ENSEMBL Variant Effect Predictor and custom annotation data 2. turns the annotated VCF into a MAF file for import into cBioPortal 3. generates human-readable variant- and gene-centric reports In addition, this worklfow exports the resulting MAF dataset to a WebDAV-enabled remote folder for subsequent import into cBioPortal. WebDAV access details can be configured in the Galaxy user preferences.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.629.1" ; ns1:input , , , , , ; ns1:keywords "EOSC4Cancer" ; ns1:license ; ns1:name "Cancer variant annotation (hg38 VEP-based) with MAF export" ; ns1:output , , , , , , , , , , , , , , , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-10-07T07:47:46Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:52Z"^^ns1:Date ; ns1:description """# scRNA-Seq pipelines Here we forge the tools to analyze single cell RNA-Seq experiments. 
The analysis workflow is based on the Bioconductor packages [*scater*](https://bioconductor.org/packages/devel/bioc/vignettes/scater/inst/doc/overview.html) and [*scran*](https://bioconductor.org/packages/devel/bioc/vignettes/scran/inst/doc/scran.html) as well as the Bioconductor workflows by Lun ATL, McCarthy DJ, & Marioni JC [*A step-by-step workflow for low-level analysis of single-cell RNA-seq data.*](http://doi.org/10.12688/f1000research.9501.1) F1000Res. 2016 Aug 31 [revised 2016 Oct 31];5:2122 and Amezquita RA, Lun ATL et al. [*Orchestrating Single-Cell Analysis with Bioconductor*](https://osca.bioconductor.org/index.html) Nat Methods. 2020 Feb;17(2):137-145. ## Implemented protocols - MARS-Seq (massively parallel single-cell RNA-sequencing): The protocol is based on the publications of Jaitin DA, et al. (2014). *Massively parallel single-cell RNA-seq for marker-free decomposition of tissues into cell types.* Science (New York, N.Y.), 343(6172), 776–779. https://doi.org/10.1126/science.1247651 and Keren-Shaul H., et al. (2019). *MARS-seq2.0: an experimental and analytical pipeline for indexed sorting combined with single-cell RNA sequencing.* Nature Protocols. https://doi.org/10.1038/s41596-019-0164-4. The MARS-Seq library preparation protocol is given [here](https://github.com/imbforge/NGSpipe2go/blob/master/resources/MARS-Seq_protocol_Step-by-Step_MML.pdf). The sequencing reads are demultiplexed according to the respective pool barcodes before they are used as input for the analysis pipeline. - Smart-seq2: Libraries are generated using the [Smart-seq2 kit](http://www.nature.com/nmeth/journal/v10/n11/full/nmeth.2639.html). 
## Pipeline Workflow All analysis steps are illustrated in the pipeline [flowchart](https://www.draw.io/?lightbox=1&highlight=0000ff&edit=_blank&layers=1&nav=1&title=scRNA-Seq#R7R3ZcpvK8mtUlTxIxSIh6dF2oiyVODm2Uzk5LykEI4kYAWGxrXz9nZ6FdUBIQoDjm%2BTeIwYYZqZ7eu%2BegXq1fXrn697ms2sie6BI5tNAfTNQFFmdTPB%2FoGVHW6bKjDasfctkDyUNt9YfxBol1hpZJgoyD4aua4eWl200XMdBRphp033ffcw%2BtnLt7Fc9fY0KDbeGbhdbv1tmuGGtsjZPbrxH1nrDPj1TpvTGUjfu174bOex7jusgemer827YHIONbrqPqSb17UC98l03pL%2B2T1fIhmXlK0bfW5TcjYfsIyes88J76eNqcz0L7YfNzfL6%2Fe3i55%2F3Qw6AB92O2FoMFM3GHV6a1gOsrm2tHXJD%2Bx3BUC99sgzxJf61Zv8lry39fAseEumLt5LFCHd87Tfh1sa%2FZHzP1pfIvoyX9Mq1XZ88pC7IH%2FxIEPrufQwkvIiXK9cJGUbJGoxbDzbIZD2SfuKrlWXbqU4R%2BRN3yu8QGKqXhru1DHwp4d9rWw8C9juGo0T6D%2FXQcmGFhnMpnl4aKgxQD8gP0VOqiUHpHXK3KPR3%2BBF2dzbR6CtsMw1nDAsfU6g5ZW2bFFaOJZVtCbYd1nHfCV7gHww1xGjyYFz%2Fc%2Fs9%2Bhi9%2BfPf3Ye7xY%2Fg4b%2Bh%2FKLx5K0Gf8vwZO3rpoWhfQz6NIAtykzKYEtMU1PYMpYE2KLxB0%2FBlj%2Ffwzd30r%2F%2FfVzcvPW3snn3bvdhOJELwEMmprfs0vXDjbt2Hd1%2Bm7SmIAALkzzzyXU9BpZfKAx3DIR6FLq4KYUU6MkK%2F4XXRxN29SN1580T65lc7PiFg%2Bebegkuf6TvJa%2BRq%2BQ98wLYDr40ALYAaWhcWDYfThEBMwhXCvnAjXwDVexFtsdD3V%2BjsOo59iAsfSUi%2BcjGNOwhywpFSEFexfPWd6kHPNdywiDV81doSPBzPJtk8VMZZ9lU7vnJbFb1PP5BR5CgZzyVWhj75e7L%2Fa%2FN7Ofvf%2B9CZfHz%2BrusLYez54KwRaw6PyKNZ00j0km8SN3DiRIgctYDlH8YEDBc4AdkzXsq50vwcK1exsJerpTBxeVWtxwAiuUh28LcADdeptkc%2FcY%2Bflh7JJXzCTzdybfpwAp8tMr0uglDkLYvADrKYm2FmMuOfNsbORg7lIW1Xa5cjCn45%2FW7W5iasnbxxRD%2Fb2m7S%2BDe6AH0ggWfN6YKi8C4ub4I0G%2F4udX9EP8c8fsjzL7dh1167JLBWGXSiFF%2BGPqRY%2Bgh3m2FeVb0Gq%2FrIlkAKfUvhocuEEVE61YCurzg0gU2VvbSDbwxYILGwV3e6UnQbh6sYvLQCJEZEmJshfoSv50iM1j%2FRivLsYhCokivlnDndaXA%2Fezmbbu6GRBeE3jIINwGa%2BEwUdC5XTOyAQ2f%2F%2FwQxnIntHRgh7pjxpMbwrStFRY4CdPXMU9F%2FlETbgAHn5ARhbDeKRwsGUm5MncGHUgTqEBc20mrQPK4ARVILDMJhJTcEmBF0IOfWFH0sMoIk6cCI5fuZrwhtjyNhctVKSXVXkO5sGSKYMV422E6Q0HIHypZpWCa7cBdrQJgQjkgHCboiw0Zky4lff77R0bq36eaDtKKaaKnlqimTWoIslJX12xJQ%2FgWIP%2FL8heYgDGpBCtQPQ1g6T4BRbOcNaVpS9c3kT%2FEzSAaEEmBIT6heBLG8kl8J23jvSCihIrG8Dd%2BwtNNM%2B5b6Z8kRuEXjMKnjEHucOkr29FJE
peAI3GBpgi%2FcuDkmRXaZmeIUQdL8gaBTqpDqfBUDMPUU0o5IMOla%2B4KjQURKzRjPUXfejYaVAljycPF13mLxRsMSu14t5KDRYFUX1at%2FqGxOOaKWSx133DN5qcR9yu98tHvyPIB%2BzBwcf%2FS54ub2%2BEtIHL57GoO5OD5eq5rNz5Z2qn04U3ldF93Ml9MnxuELuuOTdWJtkvkdzIvsNU2PivSqfTKxry%2Fo2kZTWInnxbuNNo6HQOMEO7m5iZhfWkhk%2F%2BDn9IrA%2FMNn2zDR%2FhSJ3MEocJrbo6sOwn8EWtni5ywi0mNRqPmpqQ7IDW64QZ0XMnUQ%2F1cU4JGEUsn0hMRS9KabLWOoaSEX6aUpPQNUMksQ7cvmMc0BCXikvtPbbSCz7j4qZVNHEwr4lNKqxOPGytEt1iqgR4ffd2Lx3WaFq1kdDJ5pglUQpFOKM8PFuTxZUqWP0CBUzp11Tw3BY6D9NkocJ5Q%2FN%2Fi4VsOlfAlL9HZaPuQ7CC4p6XuYaQPh2xXXRAZ36GWsrJ4BN7wNTGnigxsy1pUx8u3bQpPHTVVSh7g5kysZl6mNK9ulM3Ycjl60P2gEaN%2FWZcnKaAFEJ0HIHkhy3cfLKLopC28IkSrZG%2BXsUH4Gr%2BYWRZmoyj9Ln0N5iq0JEuvXC8cHS4PpZZzL4dUqjnkqbyQKfyfyOuNxdlkueNYntbjjvIRQVnHc0eRc7xVuzP%2FYG8Nz%2BMsGM9geBZGmMjqy5Nbmgp4qh3xpPVEiollhIUehP9cEZJcR4D4HWHSF%2B6IDs7VU3cFA9YfybCJbxAGi%2Fnu%2FYB50nL8fBMtR3hvV%2FByxse3eHBA8Rex2xQeA88dvvHbGNHmmNvupeoUTqVUXUR3cpQ%2BS7hVrYS%2Bk0hO1rGURfgGCP1EygWgjYuUXtYEJEo73JlYh9ALQyy5ylVCTpC9dB9boCRKNSk5H0kY1yQJE6lpknBkEGTWQT3WpBxWnBbUKMSReDVfNo40DPpiwKqcg62a82ZTJGVvJUA%2FFYfU6bRRHBKKLT1yl%2B%2FBozKxRe1MbKlLo8Z9CdTO0ygljV%2F78XE8OT8%2Bys8mtaDRSO3aqNSWBFyAlqb52q%2FVxtve6xvVfHKt%2B2t1qPWHehzKhFoAljrvxb6fqrl9n%2BUr%2B59vOEHj9uHi8Z186%2F78cjn%2F8k19%2BrrZ3g97pD3%2FvXxo2gt8PJUPTWYn8aHauv3t3cVNbc3%2BBqvuxGjseRBkdE4dPsm3wBA%2FXIkfpyBzTiW%2B4QjguZxFmo6VdvHaMmk6ZZ0lfhMa3CR9TfwpOVoHvqzUQjNXVtYQAguZdy1vLdMkpDAHk6Tlhq2EOijJbiUJzsQBTskU2714WJNL%2FA%2Bv3hUw1wke5hW%2BlpNr%2FA8e98Mr18Fd6xYBLsII%2B4iCcFBMyG0ADdQZfDxrvSnggdBMr44PxoN6MtALlVi1ukyncQPNacmFWmGL1ncPkkQJ5iSUjnBFf4lCLwK2A%2FUrYINfFJgOvvnbqGI53fmeY7%2BiG0%2FDIhyJxTUm467jPzw9YCZNXnj%2B%2F0LfWjbg%2B3tkPyDoddBGZM2Uc6MKk7KQKh3BnWri%2BbRneC4VUN1HQWSHQT%2BR3cAEOo3i4DS3gELbOwJWDFVFirYWVLGh6WpE%2FjPR0Iw8G%2BMdPF2dmtdqfMKL35Iaz3XZuyUPD3aruSVnvd%2BStrvu6X7EIxvyvWi4to0l81RIPdIN2I%2BcQ%2FULwc%2BAzBO1Rf4ilnqVLqXerkx%2FXJjdK%2FXOO7PTVo67N6SnKPX6CFTJnlKfVsMumUixiFdkEWwsZ%2FczMHxHD9Dvn%2FQGVu5%2FhiTRaBEYI9o2utmap0Vi5rsqDbDMByv2eXHII%2FjGaINsD%2FnB6ObkRRJ2WViYfsWqfsXjxUQ
v3mywJX0wQqYrTBAmO0qNOI4O%2Feya1gpoFsEM8mAiChtREEJXmr4FRuosAyrHJZGnwLlpoYMQ40ewwcQgoLROkRyETJRGN1F2Yy%2BYeVvS6nxWj8HLSgMcXlzk7fREjF%2FR1uPPM3NjazGOcf029uGkeNtZnTFn8vUXnXS84h93zk%2FqBaU06B%2BxnBXy0ROmfhbJRavrKjGBGmwptQG7MaEJPFadRkeSmEjbWvo6Q%2BSz%2B1JykznYrTJuKzbyvLGQqtKtW%2BX%2B92P4aG23MlI%2Baab0dbwcXg6nz0XVODayOkN7yvSVHEURAL5xIqNpWa5UiHyj5LBAZIrUapyteKtMzxNCN82hMx9w6bjyocCnhdzVJp1BtASDoYJl1nDnoeAo2klMlKQ2E0nrwaQMZKudh9qhmPk5HE4yJ38FyRxLPQwf597O%2Fjslj63Q26hZp7Yzs%2FGQ45OMcMqzib%2FrQ650bSirPYmT4gSDG31Z2fjSOKnc89q4VWb2k7rMDgyYIi%2BRiCmw3mMNIGXVT9xrZ%2BRn6fq1dCKHMzLtWTKyfEjVRCnWg2iTkYnjwV9gOYimTBHTmvRuJveC3k3lrEljMt8jvOeeH8%2FboXdLfXsFxkRy5ExNamdgwkJqLy2t9Xdr3UaSJx6mwYZ5OEWbPkuKlhfNJ1q3ormQolUndv5fMhceTZCmYcJVbUswr00lLkzz2%2BcPl7Ts4R3esBLkiv%2FOBl6IiEa6Lr1uEh8J7mjAnSdJIUXmu7kh6wMfIGndBLN1k5gEiBn1Rq6MAcqWwW9H0MLTwrIdm0noknEfTqNm7dOo3LlBqwn8HRSPCML3luTvIB91je9o5A%2FsY%2Fh%2BCNtm0lQNk5xIpwhKfHVuzv3rJbr%2BmnOztVEm8tHmXDWLZi2Zc%2FmA%2B2bOXSMHXbrmDouFSm2R0OcKMBMlMbl%2BIFQbeoMPImcNrvU2bLnwSSiOiAejHEyIJ9KgdUJ8BmFxPOtWWBRb%2BP5631ej0mLtzNx%2BWvg0tZ8WPiysHWbcw9hHKtXS0DAeS8%2FKGxUE2cCzrTA8d9pkSv7k8zlc5uwgeTInJVaJmmXiabkIegaZU5O6lTkrTVy9CYAtxt6Hvm7c9zT%2BlVqtYMpskMxC%2F2AFUNEsoEfk9j7VpAFszwfqjQUmprNF4gstHz2qnNB%2FCWFWW0JoWkA4KX6fD%2Fso8nX2gso3CGLnHAN4%2BTvkuJlzSiqkhO7pGpVPSKEHDAG8JZ6I%2B3GdmUQ%2B0n2F9DDyc6fxkW%2Bsw1XbMd2FKO2kDjAL0LZdmowYkCkCTlcVWx4cWQ64weq%2BTcgkSoZIT%2BW60dRSA1RavIOLpSEKC3fewr0xDXm5hXvFKnaPHDIxzzzIISO3yEDngiAacYJgvzLg5uV1H3p%2FplwSaRMnZxFMqlYVXtr5a3RJssUpBuT8LbxT2flb1GOWy3HjKUz3mJXDWUj8GvN2Urs%2FrlNA%2FWk6QTZRJWdq19DxW7vACsg6Iy84%2FbScc53k8%2F%2BBnX5Q0HM492fOSxrE4lCxZpI8E7D22fR0aix0AT7%2F7PH97rs6x6%2Fy02rrqaNd8c54mJ1Yzz44dbLH9cfc4WE9sp7xuIy4lswfy%2FOQ%2BbpqZK3oio6%2BpaG3mIc9gE5IjpWnpj1PZ4dWIibC1M3NbVPXdV1bHt3IIxpBsv7T51EqR4zyWZtKc0q4ILq4ZR2c1%2BPtsRMA6odmEuF7RMaW%2BjYhYi%2FC1J9NWlTnNUtNnq%2FoTqdZXU3Y%2BqdtGvu5Y7KDGpWNhANMp4eFA0xnsxyKnSnhB2qf3LACLHVDAhi0QNag1mkiZWDKc2%2B6j84gqUkCE2oh14fVb4E5HBwGME%2Bjx%2FOJeMp76qfjolm8TU%2F9e%2BnjanM9C%2B2Hzc3y%2Bv3t4uef98NyR30Bu
YAPCPlc3mY1FrGzT2gNom2CvLS7AjtblrIzVrU5BSRhheV0VAZrKhR6Li3sLMKKLEVvhNNlCYlAVBvLFab%2FUxidEAmKpxwuYX8PUodbxgU5oa8VFUxsolTV8KoctxubWOlZdgvynZXegWPBSvPakY2v9CmOZS0vEqcsvnyFk0ay1iJtKVMxKlWJKgFY56LvkfLuIba%2FMyCXzGsE7cMudrBP89jVptOzEr1frs9TuCzP0BnGbO8L6tap2I4vze9VuRgv1ovT24F16F5qgMXMtWzmz0SQM8EjJDMcRjkTh%2BmyzL1FPRMLN2fa62Wg2wsw2OWRUyz%2FiLBTOxN2HlrxHSTfIR8ZQVHKkyp8DHV6EXfyIcSANNIR54k3grwWBSQfjaF5sUhrAZfLhIfCgDTxiN7QxEli%2F0E0XtGEoxSAd4sCLkh8RTXrL4yxEt9P1%2BjZnhBvnP3K%2FMb1rT94fnr8%2FVjGUJvZJAUlYVzcJMpsXtwkZ7Nq%2F%2FX5wEWbdrow7rratpC2cv%2BY2x%2FvrA9vl7e3T%2B7O3X388eXbsK6Rm5sQuq7ykjtdcpo9VXbv87ymx7mN3OyQCWKQ%2BATHYtQu9eJ6OwKP4B4%2BQU7UoPUZYjmBNXJx4YwW760VgA6iO8iN4JrNiljTYru3IrR8lyPbszN851N9px2XbBSegNrxMXIHEUJ88ZVUSibW64aPRT2eFAqSe6pIZvuhyULAF21izxPwchbwyplcvXUhX%2BkSbh%2F0wjjIjqu0ngv0x0L%2B0KDKuqggXPu2QiqrBlnLMB759u4S8m5hOvu4ZYId9Cqkqbnqm%2BH8wD1Xn72Op2ruUNaxoO6Qohb5qzptwPMgrhspdbu1zn4qa1UVtP26QGd1jIXD7lMd4x6f%2Fn4ayM9zMMnBMU6T3LklMyWNPnufV7STDn%2BvZ5GY9Acdz3K2XV1EElNWpReIlLcLqPuqxeafnyinIFJtO8JVFOqm7oW17Qe8JV2Y0Udb9wGsoKQrYpVluWyspk7ccRflHA02xao4unKUe3bWhHwYndrHKrN8l%2FY41j3EysS2r8Hu8eCklxf2nq1woyoyl%2B67C3wXBazkVjm1CoQeX8ZBJ6nCVwvyR8g508ut5eWzrFqVq7P1VoO%2FgxJnUb5gF2vGVM0yGBtfswMoCUvf6CbBAimrvzUU6jmf5tigKAJxKnD%2FqHzJGwcuZ8yl0UgHxBKStyI732LHns1%2F8m5EcoCS7dL8bohDxzwtQo5BE%2BNYkU6bnj5asfvtQiZX0nIHtIT0d0Xoj%2BfRzmm%2BePbIeILT%2BFcQZl2zVf0X5pSfkkGhwGbBvvvPFSVrmIKT4Rw5uWvX3%2BLP%2F9Ez2YPx2X0SCaGq19UHE3IQV8lKsIR681cUhOSIQZqCXzhjFE97BRuULuuD7lsQSUNSLS3HsEfJEhg7g0SCeRs9QK9rjusNySyV0BNeVJ%2BPjRUJ%2BHp1QcEaDm%2Bv38Zj%2Fvb54msChVpfubKjgMOC%2Bossuh67ZPhBtBx6rhfZvKLPIeu6o8zvPi4ym9Rx46tDh1Cz2wvHAfKE9g7vYJx6Y61IRSkoUkQX3kdBwHEiLq9AyjFAkMKQFGIi3JqNBAi%2Fd%2BgAcGORdOTDCnJc56%2FJAZhP8vpRkTEImb7aRBaA0ELbIzX8SKtQI2e%2FdFApfDzOWl20ec5OW79SeNYw3FalcD7gvlUK%2FxzZoYU5b%2B2suWiLSTZhoTHDRu0diFUzSW6vf6VV5b5A82rtmLJqtjOBfNz1CQpyp7ECXdb8PNRDKFw%2BRtt6U3SlapD9rPj5NZ2l5aysdeQXJNIeGXNo3U7%2BBZDlNz5aZT5QIKGYio982xs5kIFTSkaHCSk1qXa1oAsC7fDYiF4mVTZTySFM608aMdYPQz9yIOTVLE5D3CFfW73Oep9%2FdbMG6HZXHKsJ%2BhY
0v0aWW9Bb1VqLJp4vHdv2gpDjMxtZjFxPPUC6OAK%2BpIHqhMC5Xf%2FedokdZ4u1xzXRf1PjFemB5d0e0YCVV8KSggO%2B%2B6q5AeKF0LHYCQyHjYCD3I78TFYbCJgklxor9oMk4F8K9n8EUBQr6sjAYoOftdW5BmjxKQDWmtfr2JYiuptUOY4TwuPtGr%2BIBcGAFT6mDelSyLSWY2JMoAiOf3x2A2poChHYG%2B4RAvxNFjFA5KyKpK5jyUxa3QZnTEw2fde746ISDK48Nf40yVvJCN4TVZSKK6oXNW0ge0csZosM07mVPCl9ec96lcuFpYv43NOX8SVmMWH6cYxgm8%2BuCSr12%2F8B). Specify desired analysis details for your data in the respective *essential.vars.groovy* file (see below) and run the selected pipeline *marsseq.pipeline.groovy* or *smartsseq.pipeline.groovy* as described [here](https://gitlab.rlp.net/imbforge/NGSpipe2go/-/blob/master/README.md). The analysis allows further parameter fine-tuning subsequent the initial analysis e.g. for plotting and QC thresholding. Therefore, a customisable *sc.report.Rmd* file will be generated in the output reports folder after running the pipeline. Go through the steps and modify the default settings where appropriate. Subsequently, the *sc.report.Rmd* file can be converted to a final html report using the *knitr* R-package. ### The pipelines includes: - FastQC, MultiQC and other tools for rawdata quality control - Adapter trimming with Cutadapt - Mapping to the genome using STAR - generation of bigWig tracks for visualisation of alignment - Quantification with featureCounts (Subread) and UMI-tools (if UMIs are used for deduplication) - Downstream analysis in R using a pre-designed markdown report file (*sc.report.Rmd*). Modify this file to fit your custom parameter and thresholds and render it to your final html report. The Rmd file uses, among others, the following tools and methods: - QC: the [scater](http://bioconductor.org/packages/release/bioc/html/scater.html) package. - Normalization: the [scran](http://bioconductor.org/packages/release/bioc/html/scran.html) package. - Differential expression analysis: the [scde](http://bioconductor.org/packages/release/bioc/html/scde.html) package. 
- Trajectory analysis (pseudotime): the [monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html) package. ### Pipeline parameter settings - essential.vars.groovy: essential parameter describing the experiment - project folder name - reference genome - experiment design - adapter sequence, etc. - additional (more specialized) parameter can be given in the var.groovy-files of the individual pipeline modules - targets.txt: comma-separated txt-file giving information about the analysed samples. The following columns are required - sample: sample identifier. Must be a unique substring of the input sample file name (e.g. common prefixes and suffixes may be removed). These names are grebbed against the count file names to merge targets.txt to the count data. - plate: plate ID (number) - row: plate row (letter) - col: late column (number) - cells: 0c/1c/10c (control wells) - group: default variable for cell grouping (e.g. by condition) for pool-based libraries like MARSseq required additionally: - pool: the pool ID comprises all cells from 1 library pool (i.e. a set of unique cell barcodes; the cell barcodes are re-used in other pools). Must be a unique substring of the input sample file name. For pool-based design, the pool ID is grebbed against the respective count data filename instead of the sample name as stated above. - barcode: cell barcodes used as cell identifier in the count files. After merging the count data with targets.txt, the barcodes are replaced with sample IDs given in the sample column (i.e. here, sample names need not be a substring of input sample file name). ### Programs required - FastQC - STAR - Samtools - Bedtools - Subread - Picard - UCSC utilities - RSeQC - UMI-tools - R ## Resources - QC: the [scater](http://bioconductor.org/packages/release/bioc/html/scater.html) package. - Normalization: the [scran](http://bioconductor.org/packages/release/bioc/html/scran.html) package. 
- Trajectory analysis (pseudotime): the [monocle](https://bioconductor.org/packages/release/bioc/html/monocle.html) package. - A [tutorial](https://scrnaseq-course.cog.sanger.ac.uk/website/index.html) from Hemberg lab - Luecken and Theis 2019 [Current best practices in single‐cell RNA‐seq analysis: a tutorial](https://www.embopress.org/doi/10.15252/msb.20188746) """ ; ns1:keywords "scRNA-seq, smart-seq 2, bpipe, groovy" ; ns1:license ; ns1:name "scRNA-seq Smart-seq 2" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bin_widths" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cells_per_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "exclude" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "inputReads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metadataSchema" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "subjects" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "colinear_components" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "indexed_paths" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mergedMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "odgiGraph" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "odgiPNG" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "odgiRDF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "readsMergeDedup" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "seqwishGFA" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "K-mer length" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maternal reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Hifi reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paternal reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Ploidy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope Model Parameters (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope linear plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope linear plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope linear plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope log plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope log plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope log plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope summary (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed linear plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed linear plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed linear plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed log plot (child)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed log plot (maternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenomeScope transformed log plot (paternal)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl database : Child" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl database : maternal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl database : paternal" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-05-10T02:02:12Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:31Z"^^ns1:Date ; ns1:description "Create Meryl Database used for the estimation of assembly parameters and quality control with Merqury. Part of the VGP pipeline." ; ns1:input , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "kmer-profiling-hifi-trio-VGP2/main" ; ns1:output , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 6 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-03-10T03:01:43Z"^^ns1:Date ; ns1:dateModified "2026-03-10T03:01:44Z"^^ns1:Date ; ns1:description "Evaluation of Pacbio Hifi Reads and genome profiling. Create Meryl Database used for the estimation of assembly parameters and quality control with Merqury. Part of the VGP pipeline." ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "kmer-profiling-hifi-VGP1/main" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 14 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database for Busco Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size - Parameter File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope model parameters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Primary assembly" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection - Trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "What sequences are you purging?" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly statistics for purged assemblies" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged alternate/hap2 assembly: Full Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged alternate/hap2 assembly: Full Table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged alternate/hap2 assembly: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged alternate/hap2 assembly: Summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged alternate/hap2 assembly: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged primary/hap1 assembly: Full Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged primary/hap1 assembly: Full Table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged primary/hap1 assembly: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged primary/hap1 assembly: Summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged primary/hap1 assembly: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cutoffs for alternate assembly" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cutoffs for primary assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated Genome Size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury on Phased assemblies: Images" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury on Phased assemblies: stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name mapping Alternate assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nx Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged Alternate Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged Alternate assembly (gfa)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged Primary Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged Primary Assembly (gfa)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read Coverage and cutoffs calculation on alternate assembly: Histogram Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read Coverage and cutoffs calculation on primary assembly: Histogram plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Size Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clean_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_QV" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_01.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_02.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-asm.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-cn.fl" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-01-29T03:01:41Z"^^ns1:Date ; ns1:dateModified "2026-01-29T03:01:43Z"^^ns1:Date ; ns1:description "Purge contigs marked as duplicates by purge_dups. " ; ns1:input , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Purge-duplicate-contigs-VGP6/main" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 24 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-10-31T10:42:03Z"^^ns1:Date ; ns1:dateModified "2024-09-09T08:04:38Z"^^ns1:Date ; ns1:description """| tool | version | license | | -- | -- | -- | | abritAMR | 1.0.14 | [CC-BY-4.0](https://zenodo.org/records/12514579) | """ ; ns1:keywords "" ; ns1:license ; ns1:name "Workflow 1: AbritAMR" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-11-02T11:09:12Z"^^ns1:Date ; ns1:dateModified "2023-11-24T08:41:20Z"^^ns1:Date ; ns1:description """**Name:** Word Count **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Wordcount is an application that counts the number of words for a given set of files. To allow parallelism every file is treated separately and merged afterwards. 
# Execution instructions Usage: ``` runcompss --lang=python src/wordcount.py datasetPath ``` where: * datasetPath: Absolute path of the file to parse (e.g. /home/compss/tutorial_apps/python/wordcount/data/) # Execution Examples ``` runcompss --lang=python src/wordcount.py $(pwd)/data/ runcompss src/wordcount.py $(pwd)/data/ python -m pycompss src/wordcount.py $(pwd)/data/ ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.635.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Tutorial, Example, Marenostrum IV, Supercomputer, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Wordcount test, using files as task inputs, and dictionaries as task outputs (executed at Marenostrum IV supercomputer)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-02T11:59:44Z"^^ns1:Date ; ns1:dateModified "2023-11-14T11:58:44Z"^^ns1:Date ; ns1:description """# ![sanger-tol/insdcdownload](docs/images/sanger-tol-insdcdownload_logo.png) [![GitHub Actions CI Status](https://github.com/sanger-tol/insdcdownload/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/insdcdownload/actions?query=workflow%3A%22nf-core+CI%22) [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7155119-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7155119) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Get help on Slack](http://img.shields.io/badge/slack-SangerTreeofLife%20%23pipelines-4A154B?labelColor=000000&logo=slack)](https://SangerTreeofLife.slack.com/channels/pipelines) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40sangertol-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/sangertol) [![Watch on YouTube](http://img.shields.io/badge/youtube-tree--of--life-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/channel/UCFeDpvjU58SA9V0ycRXejhA) ## Introduction **sanger-tol/insdcdownload** is a pipeline that downloads assemblies from INSDC into a Tree of Life directory structure. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! On release, automated continuous integration tests run the pipeline on a full-sized dataset on the GitHub CI infrastructure. This ensures that the pipeline runs in a third-party environment, and has sensible resource allocation defaults set to run on real-world datasets. ## Pipeline summary ## Overview The pipeline takes an assembly accession number, as well as the assembly name, and downloads it. It also builds a set of common indices (such as `samtools faidx`), and extracts the repeat-masking performed by the NCBI. 
Steps involved: - Download from the NCBI the genomic sequence (Fasta) and the assembly stats and reports files. - Turn the masked Fasta file into an unmasked one. - Compress and index all Fasta files with `bgzip`, `samtools faidx`, and `samtools dict`. - Generate the `.sizes` file usually required for conversion of data files to UCSC's "big" formats, e.g. bigBed. - Extract the coordinates of the masked regions into a BED file. - Compress and index the BED file with `bgzip` and `tabix`. ## Quick Start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.04.0`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash nextflow run sanger-tol/insdcdownload -profile test,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. 
> - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! ```console nextflow run sanger-tol/insdcdownload --assembly_accession GCA_927399515.1 --assembly_name gfLaeSulp1.1 --outdir results ``` ## Documentation The sanger-tol/insdcdownload pipeline comes with documentation about the pipeline [usage](docs/usage.md) and [output](docs/output.md). ## Credits sanger-tol/insdcdownload was mainly written by @muffato, with major borrowings from @priyanka-surana's [read-mapping](https://github.com/sanger-tol/readmapping) pipeline, e.g. the script to remove the repeat-masking, and the overall structure and layout of the sub-workflows. ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#pipelines` channel](https://sangertreeoflife.slack.com/channels/pipelines). 
Please [create an issue](https://github.com/sanger-tol/insdcdownload/issues/new/choose) on GitHub if you are not on the Sanger slack channel. ## Citations If you use sanger-tol/insdcdownload for your analysis, please cite it using the following doi: [10.5281/zenodo.7155119](https://doi.org/10.5281/zenodo.7155119) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:keywords "Bioinformatics" ; ns1:license ; ns1:name "sanger-tol/insdcdownload v1.1.0 - Deciduous ent" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-11-03T08:54:32Z"^^ns1:Date ; ns1:dateModified "2024-08-02T14:54:52Z"^^ns1:Date ; ns1:description """**Name:** TruncatedSVD (Randomized SVD) **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs **Machine**: MareNostrum4 TruncatedSVD (Randomized SVD) for computing just 456 singular values out of a (3.6M x 1200) size matrix. 
The input matrix represents a CFD transient simulation of aire moving past a cylinder. This application used [dislib-0.9.0](https://github.com/bsc-wdc/dislib/tree/release-0.9) """ ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Machine Learning, PyCOMPSs, dislib, SVD, Marenostrum IV, Supercomputer, non_data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Randomized SVD" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , ; ns1:dateCreated "2020-10-07T09:36:13Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:44:54Z"^^ns1:Date ; ns1:description "" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.63.1" ; ns1:image ; ns1:input , , , , , , ; ns1:keywords "covid-19, CWL, pangenome" ; ns1:license ; ns1:name "COVID-19 PubSeq Pangenome Generate" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bam file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BUSCO dataset" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Output destination (not used in the workflow itself)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gtdbtk data directory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Identifier used" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "memory usage (MB)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run SemiBin" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SemiBin Environment" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CWL base step number" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sub workflow Run" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bin files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly/Bin read stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bins summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BUSCO" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CheckM" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "DAS Tool" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EukRep fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "EukRep stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTDB-Tk" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MaxBin2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MetaBAT2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SemiBin" . a ns1:ComputerLanguage ; ns1:alternateName "AS" ; ns1:identifier ; ns1:name "Autosubmit" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-11-03T22:59:42Z"^^ns1:Date ; ns1:dateModified "2023-11-03T22:59:42Z"^^ns1:Date ; ns1:description "Autosubmit mHM test domains" ; ns1:keywords "" ; ns1:license ; ns1:name "Autosubmit mHM test domains" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2026-03-13T03:02:01Z"^^ns1:Date ; ns1:dateModified "2026-03-13T03:02:01Z"^^ns1:Date ; ns1:description "Assemble Genome using PacBio HiFi and HiC data from the same individual for phasing. Prerequisite: Run k-mer profiling workflow (VGP1). 
This workflow uses HiFiasm for contigging, and generates assembly statistics, BUSCO reports, Merqury histograms, and the genome assembly contigs in fasta and GFA format." ; ns1:isBasedOn ; ns1:keywords "vgp, Reviewed" ; ns1:license ; ns1:name "Assembly-Hifi-HiC-phasing-VGP4/main" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 33 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bits for bloom filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database for Busco Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope Model Parameters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope Summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hapmer Database: Maternal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hapmer Database: Paternal" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Homozygous Read Coverage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maternal Illumina reads (hap2)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database: Child" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name for Haplotype 1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name for Haplotype 2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection: child" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paternal Illumina reads (hap1)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Utilize homology information to correct trio-phasing errors" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly for report" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly statistics for Hap1 and Hap2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Gff Hap1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Gff Hap2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Summary Hap1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Summary Hap2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Summary Image Hap1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Busco Summary Image Hap2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap1 (paternal) contigs: Full table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap1 (paternal) contigs: Full table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap1 (paternal) contigs: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap1 (paternal) contigs: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap2 (maternal) contigs: Full table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap2 (maternal) contigs: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap2 (maternal) contigs: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on Hap2 (maternal) contigs: Full table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated Genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Trio hap1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Hifiasm Trio hap2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury Images" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury QV" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury Trio Histogram" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "No Sequence hap1 gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "No Sequence hap2 gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nx Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Size Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clean_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cutadapt multiqc stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "json_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_qv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "multiqc html report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_01.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_02.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-asm.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "raw unitig graph image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "usable hap1 gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "usable hap2 gfa" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-10-01T02:01:59Z"^^ns1:Date ; ns1:dateModified "2025-11-27T14:15:46Z"^^ns1:Date ; ns1:description "Generate phased assembly based on PacBio HiFi reads and parental Illumina data for phasing. Part of the VGP workflow suite, it needs to be run after the Trio k-mer Profiling workflow VGP2. 
This workflow uses HiFiasm for contigging, and generates assembly statistics, BUSCO reports, Merqury plots, and the genome assembly contigs in fasta and GFA format. " ; ns1:input , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Assembly-Hifi-Trio-phasing-VGP5/main" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 25 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bionano Data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Conflict resolution files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size - Parameter File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input GFA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Statistics for s1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bionano scaffolds reconciliated: fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bionano scaffolds reconciliated: gfa" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nx Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Scaffolds: agp" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Size Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clean_stats" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-10-07T15:34:25Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:37Z"^^ns1:Date ; ns1:description """# Scaffolding with Bionano Scaffolding using Bionano optical map data ## Inputs 1. Bionano data [cmap] 2. Estimated genome size [txt] 3. Phased assembly generated by Hifiasm [gfa1] ## Outputs 1. Scaffolds 2. Non-scaffolded contigs 3. QC: Assembly statistics 4. QC: Nx plot 5. 
QC: Size plot""" ; ns1:input , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Scaffolding-Bionano-VGP7/main" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 5 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Report - STEC pipeline (blast) on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Summary - STEC pipeline (blast) on input dataset(s)" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-11-07T17:29:22Z"^^ns1:Date ; ns1:dateModified "2024-09-09T08:16:30Z"^^ns1:Date ; ns1:description """| database | database version | | -- | -- | | ResFinder | 2022-07-19 | | CARD | 2023-12-03 |""" ; ns1:keywords "" ; ns1:license ; ns1:name "Workflow 2: Sciensano" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Nadolina Brajuka" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database for NCBI FCS GX" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Haplotype" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum length of sequence to consider for mitochondrial scaffolds" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Scaffolded assembly (fasta)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Binomial Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Taxonomic Identifier" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Adaptor Report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Info" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Contaminants report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Contaminants sequences" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Final Decontaminated Assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Masking Action Report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mitochondrial Scaffolds" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Taxonomy Report" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-02-17T03:01:37Z"^^ns1:Date ; ns1:dateModified "2026-02-17T03:01:37Z"^^ns1:Date ; ns1:description "Decontamination (foreign contaminants and mitochondrial sequences) of a genome assembly after the final scaffolding step. Uses NCBI FCS GX to identify foreign contaminants and Blast to identify mitochondrial sequences. Part of the VGP Suite." ; ns1:input , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Assembly-decontamination-VGP9/main" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 17 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Fastqs for one sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reference genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Sample" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "AnnData Loaded" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2024-05-30T05:49:49Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:56:18Z"^^ns1:Date ; ns1:description """Takes fastqs and reference data, to produce a single cell counts matrix into and save in annData format - adding a column called sample with the sample name. 
""" ; ns1:input , , ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq: Count and Load with Cell Ranger" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Sarah Williams" . a ns1:Person ; ns1:name "Valentine Murigneaux" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-11-09T01:30:26Z"^^ns1:Date ; ns1:dateModified "2024-05-30T05:56:52Z"^^ns1:Date ; ns1:description """From the R1 and R2 fastq files of a single samples, make a scRNAseq counts matrix, and perform basic QC with scanpy. Then, do further processing by making a UMAP and clustering. Produces a processed AnnData Depreciated: use individual workflows insead for multiple samples""" ; ns1:isPartOf ; ns1:keywords "scRNAseq" ; ns1:license ; ns1:name "scRNAseq Single Sample Processing Cell Ranger" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:Person ; ns1:name "Saskia Hiltemann" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Groups" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:04:17Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:13:16Z"^^ns1:Date ; ns1:description """16S Microbial Analysis with mothur (short) The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input , ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 1: Further Quality Control [16S Microbial Analysis With Mothur]" ; ns1:output , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Aligned Sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Count Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Summary.seqs on input dataset(s): summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" .
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:11:57Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:11:57Z"^^ns1:Date ; ns1:description """The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. 
Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and [https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input , ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 2: Data Cleaning And Chimera Removal [16S Microbial Analysis With Mothur]" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-10-03T10:18:05Z"^^ns1:Date ; ns1:dateModified "2025-10-03T10:33:50Z"^^ns1:Date ; ns1:description """ **Workflow for Metagenomics binning from assembly.
** Minimal inputs are: Identifier, assembly (fasta) and an associated sorted BAM file. Summary:
- MetaBAT2 (binning)
- MaxBin2 (binning)
- SemiBin2 (binning)
- Binette (bin merging)
- EukRep (eukaryotic classification)
- CheckM2 (bin completeness and contamination)
- BUSCO (bin completeness)
- GTDB-Tk (bin taxonomic classification)
- CoverM (bin abundances)
Including:
**Bin annotation (workflow: https://workflowhub.eu/workflows/1170):**
- Bakta
- Interproscan
- Eggnog
- KOfamscan
- To RDF conversion with SAPP (optional, default on) --> https://workflowhub.eu/workflows/1174/
Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default
**All tool CWL files and other workflows can be found here:**
https://gitlab.com/m-unlock/cwl
**How to setup and use an UNLOCK workflow:**
https://docs.m-unlock.nl/docs/workflows/setup.html
""" ; ns1:image ; ns1:input , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Metagenomics, microbial, metagenome, binning" ; ns1:license ; ns1:name "Metagenomic Binning from Assembly" ; ns1:output , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 12 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CLM-FATES restart file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input dataset for CLM-FATES" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Cleaned Sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Count Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Training set FASTA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Training set Taxonomy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:16:29Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:16:29Z"^^ns1:Date ; ns1:description """The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and [https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input , , , ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 3: Classification [Galaxy Training: 16S Microbial Analysis With Mothur]" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Count Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Taxonomy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:20:20Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:20:20Z"^^ns1:Date ; ns1:description """The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and [https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 5: OTU Clustering [16S Microbial Analysis With Mothur]" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Shared file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:24:50Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:24:50Z"^^ns1:Date ; ns1:description """The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and [https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 6: Alpha Diversity [16S Microbial Analysis With Mothur]" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Ahmed Mehdi" . a ns1:Person ; ns1:name "Bérénice Batut" . a ns1:Person ; ns1:name "Dave Clements" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Shared file from Make.shared" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-11-09T05:26:53Z"^^ns1:Date ; ns1:dateModified "2023-11-09T05:26:53Z"^^ns1:Date ; ns1:description """The workflows in this collection are from the '16S Microbial Analysis with mothur' tutorial for analysis of 16S data (Saskia Hiltemann, Bérénice Batut, Dave Clements), adapted for pipeline use on galaxy australia (Ahmed Mehdi). The workflows developed in galaxy use mothur software package developed by Schloss et al https://pubmed.ncbi.nlm.nih.gov/19801464/. Please also refer to the 16S tutorials available at Galaxy https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop-short/tutorial.html and [https://training.galaxyproject.org/training-material/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.html """ ; ns1:input ; ns1:isPartOf ; ns1:keywords "Metagenomics" ; ns1:license ; ns1:name "Workflow 7 : Beta Diversity [16S Microbial Analysis With Mothur]" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-11-23T13:28:50Z"^^ns1:Date ; ns1:dateModified "2026-02-23T13:05:00Z"^^ns1:Date ; ns1:description """# ANNOTATO - Annotation workflow To Annotate Them Oll - [ANNOTATO - Annotation workflow To Annotate Them Oll](#annotato---annotation-workflow-to-annotate-them-oll) - [Overview of the workflow](#overview-of-the-workflow) - [Input data](#input-data) - [Pipeline steps](#pipeline-steps) - [Output data](#output-data) - [Prerequisites](#prerequisites) - [Installation](#installation) - [Running ANNOTATO](#running-annotato) - [Before running the pipeline (IMPORTANT)](#before-running-the-pipeline-important) - [Without RNASeq and protein data](#without-rnaseq-and-protein-data) - [Running ANNOTATO with RNASeq data](#running-annotato-with-rnaseq-data) - [Running ANNOTATO with protein data](#running-annotato-with-protein-data) - [Running ANNOTATO with both protein and RNASeq data](#running-annotato-with-both-protein-and-rnaseq-data) - [Running ANNOTATO with params.json](#running-annotato-with-paramsjson) - [Other parameters for running the analysis](#other-parameters-for-running-the-analysis) - [Evaluating output GFF to the exon level](#evaluating-output-gff-to-the-exon-level) - [Performance of the workflow on annotating difference eukaryote genomes](#performance-of-the-workflow-on-annotating-difference-eukaryote-genomes) - [Future work](#future-work) ## Overview of the workflow The pipeline is based on `Funannotate` or `BRAKER` and was initially developed and tested on the two datasets: - Drosophila melanogaster: [https://doi.org/10.5281/zenodo.8013373](https://doi.org/10.5281/zenodo.8013373) - *Pocillopora* cf. 
*effusa*: [https://www.ncbi.nlm.nih.gov/biosample/26809107](https://www.ncbi.nlm.nih.gov/biosample/26809107) Then, it was further tested on these species during the [BioHackathon 2023 - project 20](https://github.com/elixir-europe/biohackathon-projects-2023/tree/main/20) - Helleia helle - Homo sapiens chrom 19 - Melampus jaumei - Phakellia ventilabrum - Trifolium dubium ### Input data - Reference genome `genome.[.fna, .fa, .fasta][.gz]` - RNAseq data listed in a metadata csv file. Input type can be mixed between long and short reads, with the option of single-end read. The input file should follow the format below: ``` sample_id,R1_path,R2_path,read_type SAM1,/path/to/R1,,long # For long reads SAM2,/path/to/R1,/path/to/R2,short # For PE reads SAM3,/path/to/R1,,short # For SE reads ``` - Protein sequence data in fasta format, could be gzip or not ### Pipeline steps ![Pipeline](./assets/images/annotato-workflow.drawio.svg) The main pipeline is divided into five different subworkflows. - `Preprocess RNA` is where the input RNASeq data are QC and trimmed. - `Process RNA Minimap` is triggered when long reads FastQ are in the input CSV file. - `Process RNA STAR` will run when short reads FastQ are in the input CSV. - `Genome Masking` runs by default if not skipped. It assumes the input genome fasta is not masked and will run Denovo repeat masking with RepeatModeler and RepeatMasker. - `Filter Repeat` whenever there is a Denovo masking step, this sub-workflow will be triggered to remove the repeat sequences that appeared in the Uniprot Swissprot protein data. ### Output data - MultiQC report for the RNASeq data, before and after trimming, mapping rate of short reads, and the BUSCO results of predicted genes. 
- RepeatMasker report containing quantity of masked sequence and distribution among TE families - Protein-coding gene annotation file in gff3 format - BUSCO summary of annotated sequences ## Prerequisites The following programs are required to run the workflow and the listed version were tested. `nextflow v23.04.0 or higher` `singularity` `conda` and `mamba` (currently, having problem with Funannotate and BRAKER installation) `docker` (have not been tested but in theory should work fine) ## Installation Simply get the code from github or workflowhub and directly use it for the analysis with `nextflow`. ``` git clone https://github.com/ERGA-consortium/pipelines/tree/main/annotation/nextflow ``` ## Running ANNOTATO ### Before running the pipeline (IMPORTANT) One thing with Nextflow is that it is running off a Java Virtual Machine (JVM), and it will try to use all available memory for Nextflow even though it is unnecessary (for workflow management and job control). This will cause much trouble if you run a job on an HPC cluster. Thus, to minimize the effect of it, we need to limit the maximum memory the JVM can use. ``` export NFX_OPTS="-Xms=512m -Xmx=3g" ``` `-Xms` is the lower limit, which is set as 512 MB. `-Xmx` is the upper limit, which in this case is set as 3 GB. Please modify this according to your situation. ### Without RNASeq and protein data Perform the analysis with only the draft genome and busco database. ``` nextflow run main.nf --genome /path/to/genome.fasta --species "Abc def" --buscodb 'metazoa' ``` The workflow will run Denovo repeat masking on the draft genome, then softmask the repeat region and use the genome to run `funannotate`. Add `--run_braker` to run the genome prediction using `BRAKER` instead. ### Running ANNOTATO with RNASeq data When you want to let the workflow run the mapping by itself, uses `input.csv` as input with the link to all `FASTQ` file. 
``` nextflow run main.nf --genome /path/to/genome.fasta[.gz] --rnaseq /path/to/input.csv --species "Abc def" --buscodb 'metazoa' ``` Based on the content of the `input.csv` file to trigger different RNASeq processing workflows. The output `bam` file will then be used for genome prediction. When reads are mapped to the reference genome, the aligned `bam` file can be used as input to the pipeline instead of the raw `FASTQ` ``` nextflow run main.nf --genome /path/to/genome.fasta[.gz] --short_rna_bam /path/to/shortreads.bam [--long_rna_bam /path/to/longreads.bam] --species "Abc def" --buscodb 'metazoa' ``` **ATTENTION**: One major drawback of the current workflow is that the input genome will be sorted and renamed by the `funannotate sort` function. This is because `AUGUSTUS` and `Funannotate` won't work normally when the header of the input genome is too long and contains weird characters. Therefore, if you want to provide a `bam` file as input instead of the raw `FASTQ`, please run `funannotate sort` on the genome fasta first and then use it as the reference for running alignment. Or in case your genome headers are already shorter than 16 character, please add `--skip_rename` when running the pipeline. ### Running ANNOTATO with protein data ``` nextflow run main.nf --genome /path/to/genome.fasta[.gz] --protein /path/to/protein.fasta[.gz] --species "Abc def" --buscodb 'metazoa' ``` When only protein data is provided, the workflow will run denovo masking then repeat filter with the additional protein data. The masked genome and protein fasta will then be used for gene prediction. ### Running ANNOTATO with both protein and RNASeq data The full pipeline is triggered when both RNASeq data and protein fasta is provided. 
``` nextflow run main.nf --genome /path/to/genome.fasta[.gz] --protein /path/to/protein.fasta[.gz] --rnaseq /path/to/input.csv --species "Abc def" --buscodb 'metazoa' ``` ### Running ANNOTATO with params.json One plus side with Nextflow is that it can use a parameter JSON file called `params.json` to start the analysis pipeline with all required parameters. Please modify the content of the `params.json` according to your need then run the following command. ``` nextflow run main.nf -params-file params.json ``` ### Other parameters for running the analysis ``` Compulsory input: --genome Draft genome fasta file contain the assembled contigs/scaffolds --species Species name for the annotation pipeline, e.g. "Drosophila melanogaster" Optional input: --protein Fasta file containing known protein sequences used as an additional information for gene prediction pipeline. Ideally this should come from the same species and/or closely related species. [default: null] --rnaseq A CSV file following the pattern: sample_id,R1_path,R2_path,read_type. This could be generated using gen_input.py. Run `python gen_input.py --help` for more information. [default: null] --long_rna_bam A BAM file for the alignment of long reads (if any) to the draft genome. Noted that the header of the draft genome need to be renamed first before alignment otherwise it will causes trouble for AUGUSTUS and funannotate. [default: null] --short_rna_bam A BAM file for the alignment of short reads (if any) to the draft genome. Noted that the header of the draft genome need to be renamed first before alignment otherwise it will causes trouble for AUGUSTUS and funannotate. [default: null] --knownrepeat Fasta file containing known repeat sequences of the species, this will be used directly for masking (if --skip_denovo_masking) or in combination with the denovo masking. [default: null] Output option: --outdir Output directory. --tracedir Pipeline information. 
--publish_dir_mode Option for nextflow to move data to the output directory. [default: copy] --tmpdir Database directory. Funannotate params: --run_funannotate Whether to use funannotate for gene prediction. [default: true] --organism Fungal-specific option. Should be change to "fungus" if the annotated organism is fungal. [default: other] --ploidy Set the ploidy for gene prediction, in case of haploid, a cleaning step will be performed by funannotate to remove duplicated contigs/scaffold. [default: 2] --buscodb BUSCO database used for AUGUSTUS training and evaluation. [default: eukaryota] --buscoseed AUGUSTUS pre-trained species to start BUSCO. Will be override if rnaseq data is provided. [default: null] Braker params: --run_braker Whether to use BRAKER for gene prediction. [default: false] Skipping options: --skip_rename Skip renaming genome fasta file by funannotate sort. --skip_all_masking Skip all masking processes, please be sure that your --genome input is soft-masked before triggering this parameter. [default: false] --skip_denovo_masking Skip denovo masking using RepeatModeler, this option can only be run when --knownrepeat fasta is provided. [default: false] --skip_functional_annotation Skip functional annotation step. [default: false] --skip_read_preprocessing Skip RNASeq preprocessing step. [default: false] Execution/Engine profiles: The pipeline supports profiles to run via different Executers and Engines e.g.: -profile local,conda Executer (choose one): local slurm Engines (choose one): conda mamba docker singularity Per default: -profile slurm,singularity is executed. ``` ## Evaluating output GFF to the exon level We provided a script to analyze the output GFF of ANNOTATO (which also could be applied to the GFF file output of other pipelines) to report the number of exons per mRNA/tRNA. 
To run this, simply use: ``` python bin/analyze_exons.py -f ${GFF} ``` Below is the sample output of this script ``` INFORMATION REGARDING mRNA Number of transcripts: 33086 Largest number of exons in all transcripts: 128 Monoexonic transcripts: 4085 Multiexonic transcripts: 29001 Mono:Mult Ratio: 0.14 Boxplot of number of exons per transcript: Min: 1 25%: 2 50%: 4 75%: 8 Max: 128 Mean: 6.978812790908542 ================================================== INFORMATION REGARDING tRNA Number of transcripts: 2017 Largest number of exons in all transcripts: 1 Monoexonic transcripts: 2017 Multiexonic transcripts: 0 No multiexonic transcripts, unable to calculate Mono:Mult Ratio Boxplot of number of exons per transcript: Min: 1 25%: 1 50%: 1 75%: 1 Max: 1 Mean: 1.0 ================================================== ``` This script was originally written by [Katharina Hoff](https://github.com/Gaius-Augustus/GALBA/blob/main/scripts/analyze_exons.py) and was modified accordingly to suit the analysis of GFF file. ## Performance of the workflow on annotating different eukaryote genomes The following table is the result predicted by ANNOTATO on different species during the [Europe BioHackathon 2023](https://github.com/elixir-europe/biohackathon-projects-2023/tree/main/20). 
| Species | Genome size | N.Genes | N.Exons | N.mRNA | BUSCO lineage | BUSCO score | OMArk Completeness | OMArk Consistency | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | Drosophila melanogaster | 143M | 14,753 | 57,343 | 14,499 | diptera | C:96.1%[S:95.6%,D:0.5%],F:1.2%,M:2.7% | melanogaster subgroup, C:90.38%[S:84.32%,D:6.06%],M:9.62%,,n:12442 | A:94.21%[P:4.05%,F:7.28%],I:1.61%[P:0.5%,F:0.42%],C:0.00%[P:0.00%,F:0.00%],U:4.19% | | Helleia helle | 547M | 37,367 | 139,302 | 28,445 | lepidoptera | C:74.6%[S:73.4%,D:1.2%],F:5.4%,M:20.0% | Papilionidea, C:82.04%[S:66.12%,D:15.92%],M:17.96%, n:7939 | A:44.78%[P:14.41%,F:6.02%],I:3.53%[P:2.1%,F:0.7%],C:0.00%[P:0.00%,F:0.00%],U:51.69% | | Homo sapiens chrom 19 | 58M | 1,872 | 11,937 | 1,862 | primates | C:5.0%[S:4.8%,D:0.2%],F:0.5%,M:94.5% | Hominidae, C:8.57%[S:7.74%,D:0.83%],M:91.43%, n=17843 | A:87.54%[P:12.73%,F:13.1%],I:4.78%[P:1.5%,F:2.04%],C:0.00%[P:0.00%,F:0.00%],U:7.68% | | Melampus jaumei | 958M | 61,128 | 335,483 | 60,720 | mollusca | C:80.4%[S:67.2%,D:13.2%],F:3.8%,M:15.8% | Lophotrochozoa, C: 92.5%[S: 66.29%, D: 26.21%], M:7.5%, n:2373 | A:41.45%[P:15.72%,F:9.97%],I:15.97%[P:10.68%,F:3.07%],C:0.00%[P:0.00%,F:0.00%],U:42.57% | | Phakellia ventilabrum | 186M | 19,073 | 157,441 | 18,855 | metazoa | C:80.9%[S:79.2%,D:1.7%],F:6.5%,M:12.6% | Metazoa, C:86.79%[S:76.9%,D:9.9%],M:13.21% , n:3021 | A:53.81%[P:18.92%,F:5.06%],I:5.0%[P:2.7%,F:0.68%],C:0.00%[P:0.00%,F:0.00%],U:41.19% | | *Pocillopora* cf. 
*effusa* | 347M | 35,103 | 230,901 | 33,086 | metazoa | C:95.1%[S:92.2%,D:2.9%],F:1.7%,M:3.2% | Eumetazoa, C:94.16%[S:84.3%,D:9.86%],M:5.84%,n:3255 | A:52.94%[P:22.30%,F:3.69%],I:3.44%[P:2.08%,F:0.28%],C:0.00%[P:0.00%,F:0.00%],U:43.62% | | Trifolium dubium | 679M | 78,810 | 354,662 | 77,763 | fabales | C:95.1%[S:19.5%,D:75.6%],F:1.5%,M:3.4% | NPAAA clade, C:94.58%[S:19.21%,D:75.38%],M:5.42%,n:15412 | A:71.99%[P:11.03%,F:6.63%],I:2.77%[P:1.66%,F:0.52%],C:0.00%[P:0.00%,F:0.00%],U:25.23% | ## Future work - Python wrapper function to remove intermediate files - Adding functional annotation with `Interproscan` and `eggnog` - Adding PASA results to further improve the accuracy of the training - Adding custom parameter for both `BRAKER` and `funannotate`""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.654.2" ; ns1:isBasedOn ; ns1:isPartOf ; ns1:keywords "Annotation, Bioinformatics, Genomics, Nextflow, Transcriptomics, Workflows, rna-seq, DE_NOVO, ERGA, BGE, Biodiversity" ; ns1:license ; ns1:name "ANNOTATO - ERGA Genome Annotation Workflow in Nextflow" ; ns1:producer , , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "wolf_tutorial.zip?download=1" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-09T12:41:17Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:45:07Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "[Metabarcoding/eDNA through Obitools](https://training.galaxyproject.org/training-material/topics/ecology/tutorials/Obitools-metabarcoding/tutorial.html)" . 
This workflow allows to analyze DNA metabarcoding / eDNA data produced on Illumina sequencers using the OBITools.""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.655.1" ; ns1:input ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Obitools eDNA metabarcoding" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Reel_life_survey_fish_modif.tabular" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-11-09T12:47:49Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:44:51Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "[Biodiversity data exploration](https://training.galaxyproject.org/training-material/topics/ecology/tutorials/biodiversity-data-exploration/tutorial.html)" This workflow allows to explore biodiversity data looking at homoscedasticity, normality or collinearity of presences-absence or abundance data and at comparing beta diversity taking into account space, time and species components""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.656.1" ; ns1:input ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Biodiversity data exploration tutorial" ; ns1:output , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SENTINEL2A_20230210-111817-461_L2A_T30TWS_D.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/SENTINEL2A_20230214-105638-781_L2A_T31UET_D.zip" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_16" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-11-09T12:52:32Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:42:48Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "[Sentinel 2 biodiversity](https://training.galaxyproject.org/training-material/topics/ecology/tutorials/species-distribution-modeling/tutorial.html)" . This workflow allows to analyze remote sensing sentinel 2 satellites data to compute spectral indices such as the NDVI and visualizing biodiversity indicators """ ; ns1:input , ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Remote sensing Sentinel 2 data analysis to produce biodiversity metrics" ; ns1:output , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Pauline Seguineau" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ceamarc_env.csv" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "cnidaria_filtered.csv" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-11-09T20:55:03Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:24:38Z"^^ns1:Date ; ns1:description """ Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "Antarctic sea ecoregionalization" . This workflow allows to analyze marine benthic biodiversity data to compute ecoregions regarding environmental data. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.658.1" ; ns1:input , ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Ecoregionalization on Antarctic sea" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "W1_oversample_tr_features.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "W1_oversample_tr_labels.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "W1_te_features.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "W1_te_labels.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "W1_val_labels.tsv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "labels.tsv" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-11-09T21:12:11Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:23:54Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "Deep learning to predict animal behavior" . This workflow allows to analyze animal behavior data through deep learning. 
""" ; ns1:input , , , , , ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Animal dive prediction using deep learning" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-10-27T12:14:22Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:45:44Z"^^ns1:Date ; ns1:description "Abstract CWL Automatically generated from the Galaxy workflow file: CLM-FATES_ ALP1 simulation (5 years)" ; ns1:image ; ns1:input , ; ns1:keywords "" ; ns1:license ; ns1:name "CLM-FATES_ALP1_simulation_5years" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1st reference FASTA file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2nd reference FASTA file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotation GTF file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomic FASTA file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reads FASTQ file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:Person ; ns1:name "GAPARS Horizon 2020 European project" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "classifications-000312899389-000316591628.csv.part_00000" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-10T08:41:20Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:23:38Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, to analyze crowdsourcing results of the SPIPOLL hoverflies GAPARS European project activity on MMOS server. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.660.1" ; ns1:input , ; ns1:isPartOf ; ns1:keywords "Biodiversity, Ecology, citizen science" ; ns1:license ; ns1:name "SPIPOLL MMOS GAPARS crowdsourcing results" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "champbloc_ivr.csv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "champbloc_qecb.csv" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "ficheterrain.csv" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-10T08:55:14Z"^^ns1:Date ; ns1:dateModified "2024-12-09T08:23:16Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "[Champs blocs](https://training.galaxyproject.org/training-material/topics/ecology/tutorials/champs-blocs/tutorial.html)" . 
This workflow allows to produce Visual Rollover Indicator and dissimilarity as diversity indices on boulder fields. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.661.1" ; ns1:input , , ; ns1:keywords "Biodiversity, Ecology" ; ns1:license ; ns1:name "Boulder fields indicators" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Occurrence_southpacific.csv" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-10T09:00:00Z"^^ns1:Date ; ns1:dateModified "2025-05-21T12:32:04Z"^^ns1:Date ; ns1:description """Galaxy Workflow created on Galaxy-E european instance, ecology.usegalaxy.eu, related to the Galaxy training tutorial "[OBIS marine indicators](https://training.galaxyproject.org/training-material/topics/ecology/tutorials/obisindicators/tutorial.html)" . This workflow allows to compute and visualize marine biodiversity indicators from OBIS data. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.662.1" ; ns1:input ; ns1:keywords "Biodiversity" ; ns1:license ; ns1:name "Obis biodiversity indicator on Asian pacific" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-11-10T15:14:13Z"^^ns1:Date ; ns1:dateModified "2023-11-10T15:15:41Z"^^ns1:Date ; ns1:description """**Name:** K-means **Contact Person**: support-compss@bsc.es **Access Level**: Public **License Agreement**: Apache2 **Platform**: COMPSs # Description K-means clustering is a method of cluster analysis that aims to partition ''n'' points into ''k'' clusters in which each point belongs to the cluster with the nearest mean. It follows an iterative refinement strategy to find the centers of natural clusters in the data. When executed with COMPSs, K-means first generates the input points by means of initialization tasks. For parallelism purposes, the points are split in a number of fragments received as parameter, each fragment being created by an initialization task and filled with random points. After the initialization, the algorithm goes through a set of iterations. In every iteration, a computation task is created for each fragment; then, there is a reduction phase where the results of each computation are accumulated two at a time by merge tasks; finally, at the end of the iteration the main program post-processes the merged result, generating the current clusters that will be used in the next iteration. Consequently, if ''F'' is the total number of fragments, K-means generates ''F'' computation tasks and ''F-1'' merge tasks per iteration. 
# Execution instructions Usage: ``` runcompss --classpath=application_sources/jar/kmeans.jar kmeans.KMeans <...> ``` where ''<...>'': * -c Number of clusters * -i Number of iterations * -n Number of points * -d Number of dimensions * -f Number of fragments # Execution Examples ``` runcompss --classpath=application_sources/jar/kmeans.jar kmeans.KMeans runcompss --classpath=application_sources/jar/kmeans.jar kmeans.KMeans -c 4 -i 10 -n 2000 -d 2 -f 2 ``` # Build ## Option 1: Native java ``` cd application_sources/; javac src/main/java/kmeans/*.java cd src/main/java/; jar cf kmeans.jar kmeans/ cd ../../../; mv src/main/java/kmeans.jar jar/ ``` ## Option 2: Maven ``` cd application_sources/ mvn clean package ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.663.1" ; ns1:image ; ns1:keywords "COMPSs, Java, Example, Tutorial, Marenostrum IV, Supercomputer, data_persistence" ; ns1:license ; ns1:name "Java COMPSs K-means clustering example (executed at Marenostrum IV supercomputer, inputs generated by the code)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-11-14T09:42:17Z"^^ns1:Date ; ns1:dateModified "2023-11-14T09:42:53Z"^^ns1:Date ; ns1:description """# covid-sequence-analysis-workflow This is the official repository of the SARS-CoV-2 variant surveillance pipeline developed by Danish Technical University (DTU), Eotvos Lorand University (ELTE), EMBL-EBI, Erasmus Medical Center (EMC) under the [Versatile Emerging infectious disease Observatory (VEO)](https://www.globalsurveillance.eu/projects/veo-versatile-emerging-infectious-disease-observatory) project. The project consists of 20 European partners. It is funded by the European Commission. 
The pipeline has been integrated on EMBL-EBI infrastructure to automatically process raw SARS-CoV-2 read data, presenting in the COVID-19 Data Portal: https://www.covid19dataportal.org/sequences?db=sra-analysis-covid19&size=15&crossReferencesOption=all#search-content. ## Architecture The pipeline supports sequence reads from both Illumina and Nanopore platforms. It is designed to be highly portable for both Google Cloud Platform and High Performance Computing cluster with IBM Spectrum LSF. We have performed secondary and tertiary analysis on millions of public samples. The pipeline shows good performance for large scale production. ![Component diagram](doc/img/pipeline.components.png) The pipeline takes SRA from the public FTP from ENA. It submits analysis objects back to ENA on the fly. The intermediate results and logs are stored in the cloud storage buckets or high performance local POSIX file system. The metadata is stored in Google BigQuery for metadata and status tracking and analysis. The runtime is created with Docker / Singularity containers and NextFlow. ## Process to run the pipelines The pipeline requires the Nextflow Tower for the application level monitoring. A free test account can be created for evaluation purposes at https://tower.nf/. ### Preparation 1. Store `export TOWER_ACCESS_TOKEN='...'` in `$HOME/.bash_profile`. Restart the current session or source the updated `$HOME/.bash_profile`. 2. Run `git clone https://github.com/enasequence/covid-sequence-analysis-workflow`. 3. Create `./covid-sequence-analysis-workflow/data/projects_accounts.csv` with submission_account_id and submission_password, for example: > project_id,center_name,meta_key,submission_account_id,submission_password,ftp_password > PRJEB45555,"European Bioinformatics Institute",public,,, ### Running pipelines 1. Run `./covid-sequence-analysis-workflow/init.sra_index.sh` to initialize or reinitialize the metadata in BigQuery. 2. 
Run `./covid-sequence-analysis-workflow/./start.lsf.jobs.sh` with proper parameters to start the batch jobs on LSF or `./covid-sequence-analysis-workflow/./start.gls.jobs.sh` with proper parameters to start the batch jobs on GCP. ### Error handling If a job is killed or died, run the following to update the metadata to avoid reprocessing samples completed successfully. 1. Run `./covid-sequence-analysis-workflow/update.receipt.sh ` to collect the submission receipts and to update submission metadata. The script can be run at anytime. It needs to be run if a batch job is killed instead of completed for any reason. 2. Run `./covid-sequence-analysis-workflow/set.archived.sh` to update stats for analyses submitted. The script can be run at anytime. It needs to be run at least once before ending a snapshot to make sure that the stats are up-to-date. To reprocess the samples failed, delete the record in `sra_processing`. """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.664.1" ; ns1:isPartOf ; ns1:keywords "pathogen, SARS-CoV-2, Genomics" ; ns1:license ; ns1:name "covid-sequence-analysis-workflow" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-11-14T11:55:56Z"^^ns1:Date ; ns1:dateModified "2023-11-14T12:00:50Z"^^ns1:Date ; ns1:description """# ![sanger-tol/readmapping](docs/images/sanger-tol-readmapping_logo.png) [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.6563577-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.6563577) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/readmapping) ## Introduction **sanger-tol/readmapping** is a bioinformatics best-practice analysis pipeline for mapping reads generated using Illumina, HiC, PacBio and Nanopore technologies against a genome assembly. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. 
Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! On merge to `dev` and `main` branch, automated continuous integration tests run the pipeline on a full-sized dataset on the Wellcome Sanger Institute HPC farm using the Nextflow Tower infrastructure. This ensures that the pipeline runs on full sized datasets, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. ## Pipeline summary ## Quick Start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash nextflow run sanger-tol/readmapping -profile test,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. 
> - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! ```bash nextflow run sanger-tol/readmapping --input samplesheet.csv --fasta genome.fa.gz --outdir -profile ``` ## Credits sanger-tol/readmapping was originally written by [Priyanka Surana](https://github.com/priyanka-surana). We thank the following people for their extensive assistance in the development of this pipeline: - [Matthieu Muffato](https://github.com/muffato) for the text logo - [Guoying Qi](https://github.com/gq1) for being able to run tests using Nf-Tower and the Sanger HPC farm ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 
For further information or help, don't hesitate to get in touch on the [Slack `#pipelines` channel](https://sangertreeoflife.slack.com/channels/pipelines). Please [create an issue](https://github.com/sanger-tol/readmapping/issues/new/choose) on GitHub if you are not on the Sanger slack channel. ## Citations If you use sanger-tol/readmapping for your analysis, please cite it using the following doi: [10.5281/zenodo.6563577](https://doi.org/10.5281/zenodo.6563577) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:keywords "" ; ns1:license ; ns1:name "sanger-tol/readmapping v1.1.0 - Hebridean Black" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-14T12:03:59Z"^^ns1:Date ; ns1:dateModified "2023-11-14T12:03:59Z"^^ns1:Date ; ns1:description """# ![sanger-tol/ensemblgenedownload](docs/images/sanger-tol-ensemblgenedownload_logo.png) [![GitHub Actions CI Status](https://github.com/sanger-tol/ensemblgenedownload/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/ensemblgenedownload/actions?query=workflow%3A%22nf-core+CI%22) [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7183206-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7183206) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Get help on Slack](http://img.shields.io/badge/slack-SangerTreeofLife%20%23pipelines-4A154B?labelColor=000000&logo=slack)](https://SangerTreeofLife.slack.com/channels/pipelines) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40sangertol-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/sangertol) [![Watch on YouTube](http://img.shields.io/badge/youtube-tree--of--life-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/channel/UCFeDpvjU58SA9V0ycRXejhA) ## Introduction **sanger-tol/ensemblgenedownload** is a pipeline that downloads gene annotations from Ensembl into the Tree of Life directory structure. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. 
It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! On release, automated continuous integration tests run the pipeline on a full-sized dataset on the GitHub CI infrastructure. This ensures that the pipeline runs in a third-party environment, and has sensible resource allocation defaults set to run on real-world datasets. ## Pipeline summary ## Overview The pipeline takes a CSV file that contains assembly accession number, Ensembl species names (as they may differ from Tree of Life ones !), output directories, and geneset versions. Assembly accession numbers are optional. If missing, the pipeline assumes it can be retrieved from files named `ACCESSION` in the standard location on disk. The pipeline downloads the Fasta files of the genes (cdna, cds, and protein sequences) as well as the GFF3 file. All files are compressed with `bgzip`, and indexed with `samtools faidx` or `tabix`. Steps involved: - Download from Ensembl the GFF3 file, and the sequences of the genes in Fasta format. - Compress and index all Fasta files with `bgzip`, `samtools faidx`, and `samtools dict`. - Compress and index the GFF3 file with `bgzip` and `tabix`. ## Quick Start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.04.0`) 2. 
Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash nextflow run sanger-tol/ensemblgenedownload -profile test,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. 
Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! ```console nextflow run sanger-tol/ensemblgenedownload --input $PWD/assets/samplesheet.csv --outdir -profile ``` ## Documentation The sanger-tol/ensemblgenedownload pipeline comes with documentation about the pipeline [usage](docs/usage.md) and [output](docs/output.md). ## Credits sanger-tol/ensemblgenedownload was originally written by @muffato. ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#pipelines` channel](https://sangertreeoflife.slack.com/channels/pipelines). Please [create an issue](https://github.com/sanger-tol/ensemblgenedownload/issues/new/choose) on GitHub if you are not on the Sanger slack channel. ## Citations If you use sanger-tol/ensemblgenedownload for your analysis, please cite it using the following doi: [10.5281/zenodo.7183206](https://doi.org/10.5281/zenodo.7183206) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). 
> **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:keywords "" ; ns1:license ; ns1:name "sanger-tol/insdcdownload v1.0.1 - Hefty mûmakil" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-14T12:06:10Z"^^ns1:Date ; ns1:dateModified "2023-11-14T12:06:10Z"^^ns1:Date ; ns1:description """# ![sanger-tol/ensemblrepeatdownload](docs/images/sanger-tol-ensemblrepeatdownload_logo.png) [![GitHub Actions CI Status](https://github.com/sanger-tol/ensemblrepeatdownload/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/ensemblrepeatdownload/actions?query=workflow%3A%22nf-core+CI%22) [![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7183380-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7183380) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Get help on Slack](http://img.shields.io/badge/slack-SangerTreeofLife%20%23pipelines-4A154B?labelColor=000000&logo=slack)](https://SangerTreeofLife.slack.com/channels/pipelines) [![Follow on 
Twitter](http://img.shields.io/badge/twitter-%40sangertol-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/sangertol) [![Watch on YouTube](http://img.shields.io/badge/youtube-tree--of--life-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/channel/UCFeDpvjU58SA9V0ycRXejhA) ## Introduction **sanger-tol/ensemblrepeatdownload** is a pipeline that downloads repeat annotations from Ensembl into a Tree of Life directory structure. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! On release, automated continuous integration tests run the pipeline on a full-sized dataset on the GitHub CI infrastructure. This ensures that the pipeline runs in a third-party environment, and has sensible resource allocation defaults set to run on real-world datasets. ## Pipeline summary ## Overview The pipeline takes a CSV file that contains assembly accession number, Ensembl species names (as they may differ from Tree of Life ones !), output directories. Assembly accession numbers are optional too. If missing, the pipeline assumes it can be retrieved from files named `ACCESSION` in the standard location on disk. The pipeline downloads the repeat annotation as the masked Fasta file and a BED file. All files are compressed with `bgzip`, and indexed with `samtools faidx` or `tabix`. 
Steps involved: - Download the masked fasta file from Ensembl. - Extract the coordinates of the masked regions into a BED file. - Compress and index the BED file with `bgzip` and `tabix`. ## Quick Start 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.04.0`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: ```bash nextflow run sanger-tol/ensemblrepeatdownload -profile test,YOURPROFILE --outdir ``` Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. 
This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! ```console nextflow run sanger-tol/ensemblrepeatdownload --input $PWD/assets/samplesheet.csv --outdir -profile ``` ## Documentation The sanger-tol/ensemblrepeatdownload pipeline comes with documentation about the pipeline [usage](docs/usage.md) and [output](docs/output.md). ## Credits sanger-tol/ensemblrepeatdownload was originally written by @muffato. ## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#pipelines` channel](https://sangertreeoflife.slack.com/channels/pipelines). Please [create an issue](https://github.com/sanger-tol/ensemblrepeatdownload/issues/new/choose) on GitHub if you are not on the Sanger slack channel. ## Citations If you use sanger-tol/ensemblrepeatdownload for your analysis, please cite it using the following doi: [10.5281/zenodo.7183380](https://doi.org/10.5281/zenodo.7183380) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:keywords "" ; ns1:license ; ns1:name "sanger-tol/ensemblrepeatdownload v1.0.0 - Gwaihir the Windlord" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2023-11-14T12:10:54Z"^^ns1:Date ; ns1:dateModified "2023-11-14T12:10:54Z"^^ns1:Date ; ns1:description """[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/sanger-tol/treeval) ## Introduction **sanger-tol/treeval** is a bioinformatics best-practice analysis pipeline for the generation of 
data supplemental to the curation of reference quality genomes. This pipeline has been written to generate flat files compatible with [JBrowse2](https://jbrowse.org/jb2/). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! The treeval pipeline has a sister pipeline currently named [curationpretext](https://github.com/sanger-tol/curationpretext) which acts to regenerate the pretext maps and accessory files during genomic curation in order to confirm interventions. This pipeline is sufficiently different to the treeval implementation that it is written as its own pipeline. 1. Parse input yaml ( YAML_INPUT ) 2. Generate my.genome file ( GENERATE_GENOME ) 3. Generate insilico digests of the input assembly ( INSILICO_DIGEST ) 4. Generate gene alignments with high quality data against the input assembly ( GENE_ALIGNMENT ) 5. Generate a repeat density graph ( REPEAT_DENSITY ) 6. Generate a gap track ( GAP_FINDER ) 7. Generate a map of self complementary sequence ( SELFCOMP ) 8. Generate syntenic alignments with a closely related high quality assembly ( SYNTENY ) 9. Generate a coverage track using PacBio data ( LONGREAD_COVERAGE ) 10. Generate HiC maps, pretext and higlass using HiC cram files ( HIC_MAPPING ) 11. Generate a telomere track based on input motif ( TELO_FINDER ) 12. 
Run Busco and convert results into bed format ( BUSCO_ANNOTATION ) 13. Ancestral Busco linkage if available for clade ( BUSCO_ANNOTATION:ANCESTRAL_GENE ) ## Usage > **Note** > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how > to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) > with `-profile test` before running the workflow on actual data. Currently, it is advised to run the pipeline with docker or singularity as a small number of major modules do not currently have a conda env associated with them. Now, you can run the pipeline using: ```bash # For the FULL pipeline nextflow run main.nf -profile singularity --input treeval.yaml --outdir {OUTDIR} # For the RAPID subset nextflow run main.nf -profile singularity --input treeval.yaml -entry RAPID --outdir {OUTDIR} ``` An example treeval.yaml can be found [here](assets/local_testing/nxOscDF5033.yaml). Further documentation about the pipeline can be found in the following files: [usage](https://pipelines.tol.sanger.ac.uk/treeval/dev/usage), [parameters](https://pipelines.tol.sanger.ac.uk/treeval/dev/parameters) and [output](https://pipelines.tol.sanger.ac.uk/treeval/dev/output). > **Warning:** > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). ## Credits sanger-tol/treeval has been written by Damon-Lee Pointon (@DLBPointon), Yumi Sims (@yumisims) and William Eagles (@weaglesBio). We thank the following people for their extensive assistance in the development of this pipeline:
  • @gq1 - For building the infrastructure around TreeVal and helping with code review
  • @ksenia-krasheninnikova - For help with C code implementation and YAML parsing
  • @mcshane - For guidance on algorithms
  • @muffato - For code reviews and code support
  • @priyanka-surana - For help with the majority of code reviews and code support
## Contributions and Support If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). ## Citations If you use sanger-tol/treeval for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX). ### Tools An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: > **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). """ ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "sanger-tol/treeval v1.0 - Ancient Atlantis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-11-03T19:46:07Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "RNA-RNA interactome analysis using ChiRA tools suite. The aligner used is BWA-MEM." ; ns1:input , , , , ; ns1:keywords "rna, Transcriptomics" ; ns1:license ; ns1:name "RNA-RNA interactome analysis using BWA-MEM" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "1st reference FASTA file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "2nd reference FASTA file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Annotation GTF file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomic FASTA file" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reads FASTQ file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-11-16T14:27:39Z"^^ns1:Date ; ns1:dateModified "2023-11-16T14:29:36Z"^^ns1:Date ; ns1:description """**Contact Person:** support-compss@bsc.es **Access Level:** public **License Agreement:** Apache2 **Platform:** COMPSs # Description Simple is an application that takes one value and increases it by five units. The purpose of this application is to show how tasks are managed by COMPSs. 
# Execution instructions Usage: ``` runcompss --lang=python src/simple.py initValue ``` where: * initValue: Initial value for counter # Execution Examples ``` runcompss --lang=python src/simple.py 1 runcompss src/simple.py 1 python -m pycompss src/simple.py 1 ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.673.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Example, Tutorial, Laptop, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs simple example (ran on macOS laptop, input generated by the code, INOUT file example)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-11-21T09:02:46Z"^^ns1:Date ; ns1:dateModified "2023-11-21T10:28:24Z"^^ns1:Date ; ns1:description """# Evaluation of Swin Transformer and knowledge transfer for denoising of super-resolution structured illumination microscopy data In recent years, convolutional neural network (CNN)-based methods have shown remarkable performance in the denoising and reconstruction of super-resolved structured illumination microscopy (SR-SIM) data. Therefore, CNN-based architectures have been the main focus of existing studies. Recently, however, an alternative and highly competitive deep learning architecture, Swin Transformer, has been proposed for image restoration tasks. In this work, we present SwinT-fairSIM, a novel method for restoring SR-SIM images with low signal-to-noise ratio (SNR) based on Swin Transformer. The experimental results show that SwinT-fairSIM outperforms previous CNN-based denoising methods. 
Furthermore, the generalization capabilities of deep learning methods for image restoration tasks on real fluorescence microscopy data have not been fully explored yet, i.e., the extent to which trained artificial neural networks are limited to specific types of cell structures and noise. Therefore, as a second contribution, we benchmark two types of transfer learning, i.e., direct transfer and fine-tuning, in combination with SwinT-fairSIM and two CNN-based methods for denoising SR-SIM data. Direct transfer does not prove to be a viable strategy, but fine-tuning achieves results comparable to conventional training from scratch while saving computational time and potentially reducing the amount of required training data. As a third contribution, we published four datasets of raw SIM images and already reconstructed SR-SIM images. These datasets cover two types of cell structures, tubulin filaments and vesicle structures. Different noise levels are available for the tubulin filaments. These datasets are structured in such a way that they can be easily used by the research community for research on denoising, super-resolution, and transfer learning strategies. The SIM microscopy datasets that were used during this work can be downloaded through this link: http://dx.doi.org/10.5524/102461 ## Installation: This implementation requires the Tensorflow-GPU2.5 version. To avoid package conflicts, we recommend you create a new environment by using our provided environment.yml file. To create a new environment please run the following script: > conda env create -f environment.yml ## How to use this code: This code can be used to train a denoising model from scratch or to fine-tune a pretrained model. After the installation of the Python environment from the yml file, the next step is to set the input parameters in the JSON parameter file (i.e., ParameterFile.json). 
Most of the input parameters are self-explanatory but below we will discuss some of the important input parameters from the JSON file: - TrainNetworkfromScratch: This input parameter will train the model from scratch If set to True, otherwise, for fine-tuning, It should be False. - ActivateTrainandTestModel: This parameter will be set to False If you want to use this code for evaluation of the trained model or the reproducibility of the results by using pretrained models. - PretrainedmodelPath: This parameter is mandatory in case of fine-tuning or evaluation of a pretrained model. - FineTuneStartingpoint and FineTuneEndingpoint: These two input parameters are essential in the fine-tuning of a pretrained model. All the layers between the starting and ending points will be frozen during the fine-tuning of the pretrained model. After the assignment of the input parameters. You can run the following script from the command line to start training the model: > python MainModule.py 'ParameterFile.json' ## Reproducibility and evaluation: To reproduce the results of the paper all the trained models used in this work are available in the 'Models' directory at [zenodo](https://doi.org/10.5281/zenodo.7626173). This code is capable of performing all the necessary steps for the training and test phases. It will automatically evaluate the model and generate a result directory to write all the results. Similarly, during the training process, It will also create a model directory and save the trained model along with the best checkpoints in the model directory. ## Important Note: This code will work with at least one GPU. ## Reference: Please cite our paper in case you use this code for any scientific publication. We will soon upload the citation index! 
""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.675.1" ; ns1:keywords "Machine Learning, Python, image processing, SIM, microscopy, Deep learning" ; ns1:license ; ns1:name "Evaluation of Swin Transformer and knowledge transfer for denoising of super-resolution structured illumination microscopy data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Katarzyna Kamieniecka" . a ns1:Person ; ns1:name "Krzysztof Poterlowicz" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:Person ; ns1:name "khaled Jumah" . a ns1:Person ; ns1:name "poterlowicz-lab" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Normal_r1.fastq.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Normal_r2.fastq.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Tumor_r1.fastq.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Tumor_r2.fastq.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "capture_targets_chr5_12_17.bed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_chr_sorted_circos" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "out_ratio_log2_circos" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_png" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2023-11-23T17:19:43Z"^^ns1:Date ; ns1:dateModified "2024-04-03T15:20:03Z"^^ns1:Date ; ns1:description "This workflow is created as part of a tutorial listed on GTN. The workflow shows the steps in human copy number variance detection using the Contrl_FREEC tool. 
" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.676.1" ; ns1:input , , , , ; ns1:keywords "hCNV, variant-analysis, MIRACUM" ; ns1:license ; ns1:name "Somatic-Variant-Discovery-from-WES-Data-Using-Control-FREEC" ; ns1:output , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mass-spectrometry Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "SampleMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "CAMERA.annotate variableMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "dataMatrix" . a ns1:Person ; ns1:name "workflow4metabolomics" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-05-29T02:02:06Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:34Z"^^ns1:Date ; ns1:description """This workflow is composed with the XCMS tool R package (Smith, C.A. 2006) able to extract, filter, align and fill gapand the possibility to annotate isotopes, adducts and fragments using the CAMERA R package (Kuhl, C 2012). 
https://training.galaxyproject.org/training-material/topics/metabolomics/tutorials/lcms-preprocessing/tutorial.html """ ; ns1:input , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "lcms-preprocessing/main" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-11-24T08:35:41Z"^^ns1:Date ; ns1:dateModified "2023-11-24T08:38:06Z"^^ns1:Date ; ns1:description """**Name:** Increment **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Increment is an application that takes three different values and increases them a number of given times. The purpose of this application is to show parallelism between the different increments. 
# Execution instructions Usage: ``` runcompss --lang=python src/increment.py N initValue1 initValue2 initValue3 ``` where: * N: Number of times to increase the counters * initValue1: Initial value for counter 1 * initValue2: Initial value for counter 2 * initValue3: Initial value for counter 3 # Execution Examples ``` runcompss --lang=python src/increment.py 10 1 2 3 runcompss src/wordcount.py src/increment.py 10 1 2 3 python -m pycompss src/wordcount.py src/increment.py 10 1 2 3 ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.678.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Example, Tutorial, Supercomputer, Marenostrum IV, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Increment example, ran at Marenostrum IV supercomputer, example of INOUT file and compss_open usage" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-11-03T19:49:27Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description "RNA-RNA interactome analysis using ChiRA tools suite. The aligner used is CLAN." ; ns1:input , , , , ; ns1:keywords "rna, Transcriptomics" ; ns1:license ; ns1:name "RNA-RNA interactome analysis using CLAN" ; ns1:output , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse read" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_13" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_9" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mass-spectrometry Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "sampleMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Multivariate sampleMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Multivariate variableMetadata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "metaMS.runGC dataMatrix" . a ns1:Person ; ns1:name "workflow4metabolomics" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2025-05-20T02:01:59Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:34Z"^^ns1:Date ; ns1:description """This workflow is composed with the XCMS tool R package (Smith, C.A. 2006) able to extract and the metaMS R package (Wehrens, R 2014) for the field of untargeted metabolomics. https://training.galaxyproject.org/training-material/topics/metabolomics/tutorials/gcms/tutorial.html""" ; ns1:input , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "gcms-metams/main" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "BridgeDB cache" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Differential gene expression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Differential miRNA expression" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STRING identifier mapping" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "mRNA expression correlation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Correlation limit" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "miRNA mRNA correlation" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "miRTarBase data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "miRTarBase limit" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MOGAMUN cores" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MOGAMUN generations" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Maximum subnetwork size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Subnetwork merge threshold" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Minimum subnetwork size" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "MOGAMUN runs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STRING data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STRING filter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STRING limit" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Variant Burden" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Full network" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Subnetworks" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-11-27T16:19:23Z"^^ns1:Date ; ns1:dateModified "2024-02-01T11:27:19Z"^^ns1:Date ; ns1:description "Workflow for Creating a large disease network from various datasets and databases for IBM, and applying the active subnetwork identification method MOGAMUN." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.681.7" ; ns1:image ; ns1:input , , , , , , , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "Bioinformatics, CWL, Genomics, Transcriptomics, Protein-Protein Interaction" ; ns1:license ; ns1:name "Inclusion Body Myositis Active Subnetwork Identification Workflow" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 7 . a ns1:Person ; ns1:name "Craig Windell" . a ns1:Person ; ns1:name "Magdalena Antczak" . a ns1:Person ; ns1:name "Marie-Emilie Gauthier" . a ns1:Person ; ns1:name "Roberto Barrero" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2024-12-18T04:26:23Z"^^ns1:Date ; ns1:dateModified "2024-12-18T04:29:36Z"^^ns1:Date ; ns1:description """# ONTViSc (ONT-based Viral Screening for Biosecurity) ## Introduction eresearchqut/ontvisc is a Nextflow-based bioinformatics pipeline designed to help diagnostics of viruses and viroid pathogens for biosecurity. 
It takes fastq files generated from either amplicon or whole-genome sequencing using Oxford Nanopore Technologies as input. The pipeline can either: 1) perform a direct search on the sequenced reads, 2) generate clusters, 3) assemble the reads to generate longer contigs or 4) directly map reads to a known reference. The reads can optionally be filtered from a plant host before performing downstream analysis. ## Pipeline overview - Data quality check (QC) and preprocessing - Merge fastq files (Fascat, optional) - Raw fastq file QC (Nanoplot) - Trim adaptors (PoreChop ABI - optional) - Filter reads based on length and/or quality (Chopper - optional) - Reformat fastq files so read names are trimmed after the first whitespace (bbmap) - Processed fastq file QC (if PoreChop and/or Chopper is run) (Nanoplot) - Host read filtering - Align reads to host reference provided (Minimap2) - Extract reads that do not align for downstream analysis (seqtk) - QC report - Derive read counts recovered pre and post data processing and post host filtering - Read classification analysis mode - Clustering mode - Read clustering (Rattle) - Convert fastq to fasta format (seqtk) - Cluster scaffolding (Cap3) - Megablast homology search against ncbi or custom database (blast) - Derive top candidate viral hits - Align reads back to top reference and derive coverage statistics (mosdepth and coverM) - De novo assembly mode - De novo assembly (Canu or Flye) - Megablast homology search against ncbi or custom database or reference (blast) - Derive top candidate viral hits - Align reads back to top reference and derive coverage statistics (mosdepth and coverM) - Read classification mode - Option 1 Nucleotide-based taxonomic classification of reads (Kraken2, Braken) - Option 2 Protein-based taxonomic classification of reads (Kaiju, Krona) - Option 3 Convert fastq to fasta format (seqtk) and perform direct homology search using megablast (blast) - Map to reference mode - Align reads to reference fasta 
file (Minimap2) and derive bam file and alignment statistics (Samtools) Code and detailed instructions can be found [here](https://github.com/eresearchqut/ontvisc). A comprehensive, step-by-step guide on setting up and executing the ONTViSc pipeline across three high-performance computing systems hosted by Australian research and computing facilities - Lyra (Queensland University of Technology), Gadi (National Computational Infrastructure), and Setonix (Pawsey) - utilising the Australian Nextflow Seqera Service, can be found [here](https://mantczakaus.github.io/ontvisc_hpc_seqera_service_guide/). ## Authors Marie-Emilie Gauthier Craig Windell Magdalena Antczak Roberto Barrero """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.683.3" ; ns1:image ; ns1:isBasedOn ; ns1:keywords "Assembly, Bioinformatics, Virology, blast, Nextflow, ONT, singularity, Virus" ; ns1:license ; ns1:name "ONTViSc (ONT-based Viral Screening for Biosecurity)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-12-04T14:22:40Z"^^ns1:Date ; ns1:dateModified "2023-12-04T14:48:29Z"^^ns1:Date ; ns1:description """**Name:** Java Wordcount **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Wordcount application. There are two versions of Wordcount, depending on how the input data is given. ## Version 1 ''Single input file'', where all the text is given in the same file and the chunks are calculated with a BLOCK_SIZE parameter. 
## Version 2 ''Multiple input files'', where the text fragments are already in different files under the same directory # Execution instructions Usage: ``` runcompss --classpath=application_sources/jar/wordcount.jar wordcount.multipleFiles.Wordcount DATA_FOLDER runcompss --classpath=application_sources/jar/wordcount.jar wordcount.uniqueFile.Wordcount DATA_FILE BLOCK_SIZE ``` where: * DATA_FOLDER: Absolute path to the base folder of the dataset files * DATA_FILE: Absolute path to the dabase file * BLOCK_SIZE: Number of bytes of each block # Execution Examples ``` runcompss --classpath=application_sources/jar/wordcount.jar wordcount.multipleFiles.Wordcount dataset/data-set/ runcompss --classpath=application_sources/jar/wordcount.jar wordcount.uniqueFile.Wordcount dataset/data-set/file_small.txt 650 runcompss --classpath=application_sources/jar/wordcount.jar wordcount.uniqueFile.Wordcount dataset/data-set/file_long.txt 250000 ``` # Build ## Option 1: Native java ``` cd application_sources/; javac src/main/java/wordcount/*.java cd src/main/java/; jar cf wordcount.jar wordcount/ cd ../../../; mv src/main/java/wordcount.jar jar/ ``` ## Option 2: Maven ``` cd application_sources/ mvn clean package ``` """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.684.1" ; ns1:image ; ns1:keywords "Java, COMPSs, Tutorial, Example, Laptop, data_persistence" ; ns1:license ; ns1:name "Java COMPSs wordcount example (laptop run, files used as inputs)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "David Lähnemann" . a ns1:Person ; ns1:name "Felix Mölder" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2023-12-14T08:17:10Z"^^ns1:Date ; ns1:dateModified "2023-12-14T08:17:10Z"^^ns1:Date ; ns1:description """# Snakemake workflow: dna-seq-varlociraptor [![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io) [![GitHub actions status](https://github.com/snakemake-workflows/dna-seq-varlociraptor/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/dna-seq-varlociraptor/actions?query=branch%3Amaster+workflow%3ATests) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4675661.svg)](https://doi.org/10.5281/zenodo.4675661) A Snakemake workflow for calling small and structural variants under any kind of scenario (tumor/normal, tumor/normal/relapse, germline, pedigree, populations) via the unified statistical model of [Varlociraptor](https://varlociraptor.github.io). ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows%2Fdna-seq-varlociraptor). If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). """ ; ns1:isBasedOn ; ns1:keywords "Bioinformatics" ; ns1:license ; ns1:name "dna-seq-varlociraptor" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)>, ; ns1:dateCreated "2023-12-15T14:57:59Z"^^ns1:Date ; ns1:dateModified "2023-12-15T15:00:35Z"^^ns1:Date ; ns1:description """**Name:** Word Count **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs # Description Wordcount is an application that counts the number of words for a given set of files. To allow parallelism the file is divided in blocks that are treated separately and merged afterwards. Results are printed to a Pickle binary file, so they can be checked using: python -mpickle result.txt This example also shows how to manually add input or output datasets to the workflow provenance recording (using the 'input' and 'output' terms in the ro-crate-info.yaml file). # Execution instructions Usage: ``` runcompss --lang=python $(pwd)/application_sources/src/wordcount_blocks.py filePath resultPath blockSize ``` where: * filePath: Absolute path of the file to parse * resultPath: Absolute path to the result file * blockSize: Size of each block. 
The lower the number, the more tasks will be generated in the workflow # Execution Examples ``` runcompss --lang=python $(pwd)/application_sources/src/wordcount_blocks.py $(pwd)/dataset/data/compss.txt result.txt 300 runcompss $(pwd)/application_sources/src/wordcount_blocks.py $(pwd)/dataset/data/compss.txt result.txt 300 python -m pycompss $(pwd)/application_sources/src/wordcount.py $(pwd)/dataset/data/compss.txt result.txt 300 ``` # Build No build is required """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.687.1" ; ns1:image ; ns1:keywords "PyCOMPSs, Tutorial, Example, Marenostrum IV, Supercomputer, data_persistence" ; ns1:license ; ns1:name "PyCOMPSs Wordcount test, dividing input file in blocks, only Python dictionaries used as task parameters (run at MareNostrum IV)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Clinical Bioinformatics Unit" . a ns1:Person ; ns1:name "Eramus Medical Center" . a ns1:Person ; ns1:name "Helena Rasche" . a ns1:Person ; ns1:name "Iacopo Cristoferi" . a ns1:Person ; ns1:name "Pathology Department" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/UCSC Genome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/mRNA-Seq Reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/html_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/output_feature_lengths" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/output_short" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2023-12-19T10:10:00Z"^^ns1:Date ; ns1:dateModified "2024-01-24T09:42:47Z"^^ns1:Date ; ns1:description "This portion of the workflow produces sets of feature Counts ready for analysis by limma/etc." 
; ns1:image ; ns1:input , ; ns1:keywords "BY-COVID, covid-19" ; ns1:license ; ns1:name "mRNA-Seq BY-COVID Pipeline: Counts" ; ns1:output , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Clinical Bioinformatics Unit" . a ns1:Person ; ns1:name "Eramus Medical Center" . a ns1:Person ; ns1:name "Helena Rasche" . a ns1:Person ; ns1:name "Iacopo Cristoferi" . a ns1:Person ; ns1:name "Pathology Department" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "factordata" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts: Counts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "featureCounts: Lengths" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "count_data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "limma_report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "minerva_table" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2023-12-19T10:10:54Z"^^ns1:Date ; ns1:dateModified "2024-01-24T09:43:21Z"^^ns1:Date ; ns1:description "Analyse Bulk RNA-Seq data in preparation for downstream Pathways analysis with MINERVA" ; ns1:image ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "BY-COVID, covid-19" ; ns1:license ; ns1:name "mRNA-Seq BY-COVID Pipeline: Analysis" ; ns1:output , , ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-11-04T18:35:15Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description """This WF is based on the official Covid19-Galaxy assembly workflow as available from https://covid19.galaxyproject.org/genomics/2-assembly/ . It has been adapted to suit the needs of the analysis of metagenomics sequencing data. 
Prior to be submitted to INDSC databases, these data need to be cleaned from contaminant reads, including reads of possible human origin. The assembly of the SARS-CoV-2 genome is performed using both the Unicycler and the SPAdes assemblers, similar to the original WV. To facilitate the deposition of raw sequencing reads in INDSC databases, different fastq files are saved during the different steps of the WV. Which reflect different levels of stringency/filtration: (1) Initially fastq are filtered to remove human reads. (2) Subsequently, a similarity search is performed against the reference assembly of the SARS-CoV-2 genome, to retain only SARS-CoV-2 like reads. (3) Finally, SARS-CoV-2 reads are assembled, and the bowtie2 program is used to identify (and save in the corresponding fastq files) only reads that are completely identical to the final assembly of the genome. Any of the fastq files produced in (1), (2) or (3) are suitable for being submitted in raw reads repositories. While the files filtered according to (1) are richer and contain more data, including for example genomic sequences of different microbes living in the oral cavity; files filtered according to (3) contain only the reads that are completely identical to the final assembly. This should guarantee that any re-analysis/re-assembly of these always produce consistent and identical results. File obtained at (2) include all the reads in the sequencing reaction that had some degree of similarity with the reference SARS-CoV-2 genome, these may include subgenomic RNAs, but also polymorphic regions/variants in the case of a coinfection by multiple SARS-CoV-2 strains. Consequently, reanalysis of these data is not guarateed to produce identical and consistent results, depending on the parameters used during the assembly. However, these data contain more information. 
Please feel free to comment, ask questions and/or add suggestions """ ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "MC_COVID19like_Assembly_Reads" ; ns1:output , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:name "Scipion" ; ns1:url . a ns1:Person ; ns1:name "Workflows and Distributed Computing" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2023-12-19T15:45:41Z"^^ns1:Date ; ns1:dateModified "2023-12-19T15:48:10Z"^^ns1:Date ; ns1:description """**Name:** Lanczos SVD **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs **Machine**: MareNostrum4 Lanczos SVD for computing singular values needed to reach an epsilon of 1e-3 on a matrix of (150000, 150). The input matrix is generated synthetically. This application used [dislib-0.9.0](https://github.com/bsc-wdc/dislib/tree/release-0.9) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.690.1" ; ns1:image ; ns1:keywords "PyCOMPSs, dislib, HPC, Marenostrum IV, Supercomputer, non_data_persistence" ; ns1:license ; ns1:name "Lanczos SVD" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , ; ns1:dateCreated "2023-12-20T01:12:30Z"^^ns1:Date ; ns1:dateModified "2023-12-20T01:30:37Z"^^ns1:Date ; ns1:description "This is a Nextflow implementaion of the GATK Somatic Short Variant Calling workflow. This workflow can be used to discover somatic short variants (SNVs and indels) from tumour and matched normal BAM files following GATK's Best Practices Workflow. 
The workflowis currently optimised to run efficiently and at scale on the National Compute Infrastructure, Gadi." ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.691.1" ; ns1:keywords "Bioinformatics, FAIR workflows, GATK4, INDELs, Nextflow, variant calling, workflow, cancer, Somatic, snv, Genomics, human, WGS, HPC" ; ns1:license ; ns1:name "Somatic-ShortV-nf" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Joser Carbonel" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2023-12-20T13:01:39Z"^^ns1:Date ; ns1:dateModified "2024-01-24T09:48:16Z"^^ns1:Date ; ns1:description """# Cancer Invasion Workflow ## Table of Contents - [Cancer Invasion Workflow](#cancer-invasion-workflow) - [Table of Contents](#table-of-contents) - [Description](#description) - [Contents](#contents) - [Building Blocks](#building-blocks) - [Workflows](#workflows) - [Resources](#resources) - [Tests](#tests) - [Instructions](#instructions) - [Local machine](#local-machine) - [Requirements](#requirements) - [Usage steps](#usage-steps) - [MareNostrum 4](#marenostrum-4) - [Requirements in MN4](#requirements-in-mn4) - [Usage steps in MN4](#usage-steps-in-mn4) - [Mahti or Puhti](#mahti-or-puhti) - [Requirements](#requirements) - [Steps](#steps) - [License](#license) - [Contact](#contact) ## Description Uses multiscale simulations to describe cancer progression into invasion. The workflow uses the following building blocks, described in order of execution: 1. PhysiBoSS-Invasion For details on individual workflow steps, see the user documentation for each building block. [`GitHub repository`]() ## Contents ### Building Blocks The ``BuildingBlocks`` folder contains the script to install the Building Blocks used in the Cancer Invasion Workflow. 
### Workflows The ``Workflow`` folder contains the workflow implementations. Currently contains the implementation using PyCOMPSs and Snakemake (in progress). ### Resources The ``Resources`` folder contains dataset files. ### Tests The ``Tests`` folder contains the scripts that run each Building Block used in the workflow for the given small dataset. They can be executed individually for testing purposes. ## Instructions ### Local machine This section explains the requirements and usage for the Cancer Invasion Workflow in a laptop or desktop computer. #### Requirements - [`permedcoe`](https://github.com/PerMedCoE/permedcoe) package - [PyCOMPSs](https://pycompss.readthedocs.io/en/stable/Sections/00_Quickstart.html) / [Snakemake](https://snakemake.readthedocs.io/en/stable/) - [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) #### Usage steps 1. Clone this repository: ```bash git clone https://github.com/PerMedCoE/cancer-invasion-workflow ``` 2. Install the Building Blocks required for the Cancer Invasion Workflow: ```bash cancer-invasion-workflow/BuildingBlocks/./install_BBs.sh ``` 3. Get the required Building Block images from the project [B2DROP](https://b2drop.bsc.es/index.php/f/444350): - Required images: - PhysiCell-Invasion.singularity The path where these files are stored **MUST be exported in the `PERMEDCOE_IMAGES`** environment variable. > :warning: **TIP**: These containers can be built manually as follows (be patient since some of them may take some time): 1. Clone the `BuildingBlocks` repository ```bash git clone https://github.com/PerMedCoE/BuildingBlocks.git ``` 2. Build the required Building Block images ```bash cd BuildingBlocks/Resources/images sudo singularity build PhysiCell-Invasion.sif PhysiCell-Invasion.singularity cd ../../.. ``` **If using PyCOMPSs in local PC** (make sure that PyCOMPSs is installed): 4. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflows/PyCOMPSs ``` 5. 
Execute `./run.sh` **If using Snakemake in local PC** (make sure that SnakeMake is installed): 4. Go to `Workflow/SnakeMake` folder ```bash cd Workflows/SnakeMake ``` 5. Execute `./run.sh` > **TIP**: If you want to run the workflow with a different dataset, please update the `run.sh` script setting the `dataset` variable to the new dataset folder and their file names. ### MareNostrum 4 This section explains the requirements and usage for the Cancer Invasion Workflow in the MareNostrum 4 supercomputer. #### Requirements in MN4 - Access to MN4 All Building Blocks are already installed in MN4, and the Cancer Invasion Workflow available. #### Usage steps in MN4 1. Load the `COMPSs`, `Singularity` and `permedcoe` modules ```bash export COMPSS_PYTHON_VERSION=3 module load COMPSs/3.1 module load singularity/3.5.2 module use /apps/modules/modulefiles/tools/COMPSs/libraries module load permedcoe ``` > **TIP**: Include the loading into your `${HOME}/.bashrc` file to load it automatically on the session start. These commands will load COMPSs and the permedcoe package which provides all necessary dependencies, as well as the path to the singularity container images (`PERMEDCOE_IMAGES` environment variable) and testing dataset (`CANCERINVASIONWORKFLOW_DATASET` environment variable). 2. Get a copy of the pilot workflow into your desired folder ```bash mkdir desired_folder cd desired_folder get_cancerinvasionworkflow ``` 3. Go to `Workflow/PyCOMPSs` folder ```bash cd Workflow/PyCOMPSs ``` 4. Execute `./launch.sh` This command will launch a job into the job queuing system (SLURM) requesting 2 nodes (one node acting half master and half worker, and the other a full worker node) for 20 minutes, and is prepared to use the singularity images that are already deployed in MN4 (located into the `PERMEDCOE_IMAGES` environment variable). It uses the dataset located into `../../Resources/data` folder. 
> :warning: **TIP**: If you want to run the workflow with a different dataset, please edit the `launch.sh` script and define the appropriate dataset path. After the execution, a `results` folder will be available with with Cancer Invasion Workflow results. ### Mahti or Puhti This section explains how to run the Cancer Invasion workflow on CSC supercomputers using SnakeMake. #### Requirements - Install snakemake (or check if there is a version installed using `module spider snakemake`) - Install workflow, using the same steps as for the local machine. With the exception that containers have to be built elsewhere. #### Steps 1. Go to `Workflow/SnakeMake` folder ```bash cd Workflow/SnakeMake ``` 2. Edit `launch.sh` with the correct partition, account, and resource specifications. 3. Execute `./launch.sh` > :warning: Snakemake provides a `--cluster` flag, but this functionality should be avoided as it's really not suited for HPC systems. ## License [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) ## Contact This software has been developed for the [PerMedCoE project](https://permedcoe.eu/), funded by the European Commission (EU H2020 [951773](https://cordis.europa.eu/project/id/951773)). ![](https://permedcoe.eu/wp-content/uploads/2020/11/logo_1.png "PerMedCoE") """ ; ns1:keywords "" ; ns1:license ; ns1:name "PerMedCoE Cancer Diagnosis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2023-12-20T17:45:10Z"^^ns1:Date ; ns1:dateModified "2023-12-20T17:49:26Z"^^ns1:Date ; ns1:description """![bacpage](https://raw.githubusercontent.com/CholGen/bacpage/split_into_command/.github/logo_dark.png){width=500} This repository contains an easy-to-use pipeline for the assembly and analysis of bacterial genomes using ONT long-read or Illumina short-read technology. # Introduction Advances in sequencing technology during the COVID-19 pandemic has led to massive increases in the generation of sequencing data. Many bioinformatics tools have been developed to analyze this data, but very few tools can be utilized by individuals without prior bioinformatics training. This pipeline was designed to encapsulate pre-existing tools to automate analysis of whole genome sequencing of bacteria. Installation is fast and straightfoward. The pipeline is easy to setup and contains rationale defaults, but is highly modular and configurable by more advance users. A successful run generates consensus sequences, typing information, phylogenetic tree, and quality control report. # Features We anticipate the pipeline will be able to perform the following functions: - [x] Reference-based assembly of Illumina paired-end reads - [x] *De novo* assembly of Illumina paired-end reads - [ ] *De novo* assembly of ONT long reads - [x] Run quality control checks - [x] Variant calling using [bcftools](https://github.com/samtools/bcftools) - [x] Maximum-likelihood phylogenetic inference of processed samples and background dataset using [iqtree](https://github.com/iqtree/iqtree2) - [x] MLST profiling and virulence factor detection - [x] Antimicrobial resistance genes detection - [ ] Plasmid detection # Installation 1. 
Install `miniconda` by running the following two command: ```commandline curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" bash Mambaforge-$(uname)-$(uname -m).sh ``` 2. Clone the repository: ```commandline git clone https://github.com/CholGen/bacpage.git ``` 3. Install and activate the pipeline's conda environment: ```commandline cd bacpage/ mamba env create -f environment.yaml mamba activate bacpage ``` 4. Install the `bacpage` command: ```commandline pip install . ``` 5. Test the installation: ```commandline bacpage -h bacpage version ``` These command should print the help and version of the program. Please create an issue if this is not the case. # Usage 0. Navigate to the pipeline's directory. 1. Copy the `example/` directory to create a directory specifically for each batch of samples. ```commandline cp example/ ``` 2. Place raw sequencing reads in the `input/` directory of your project directory. 3. Record the name and absolute path of raw sequencing reads in the `sample_data.csv` found within your project directory. 4. Replace the values `` and `` in `config.yaml` found within your project directory, with the absolute path of your project directory and pipeline directory, respectively. 5. Determine how many cores are available on your computer: ```commandline cat /proc/cpuinfo | grep processor ``` 6. From the pipeline's directory, run the entire pipeline on your samples using the following command: ```commandline snakemake --configfile /config.yaml --cores ``` This will generate a consensus sequence in FASTA format for each of your samples and place them in `/results/consensus_sequences/.masked.fasta`. An HTML report containing alignment and quality metrics for your samples can be found at `/results/reports/qc_report.html`. 
A phylogeny comparing your sequences to the background dataset can be found at `/results/phylogeny/phylogeny.tree` """ ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "Reference-based assembly with bacpage" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , ; ns1:dateCreated "2026-04-08T02:01:50Z"^^ns1:Date ; ns1:dateModified "2026-04-08T02:01:50Z"^^ns1:Date ; ns1:description "Comprehensive preprocessing for 10X Genomics CellPlex multiplexed single-cell RNA-seq data. Processes Cell Multiplexing Oligo (CMO) FASTQ files with CITE-seq-Count including required CellPlex-specific translation steps. Simultaneously processes gene expression FASTQ files with STARsolo and DropletUtils for alignment and cell filtering, and formats outputs for seamless import into Seurat/Scanpy (Read10X function)." ; ns1:isBasedOn ; ns1:keywords "single-cell" ; ns1:license ; ns1:name "fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 10 . a ns1:ComputerLanguage ; ns1:alternateName "WDL" ; ns1:identifier ; ns1:name "Workflow Description Language" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2024-01-09T17:12:32Z"^^ns1:Date ; ns1:dateModified "2024-01-09T17:12:32Z"^^ns1:Date ; ns1:description """# BACPAGE This repository contains an easy-to-use pipeline for the assembly and analysis of bacterial genomes using ONT long-read or Illumina short-read technology. 
Read the complete documentation and instructions for bacpage and each of its functions [here](https://cholgen.github.io/sequencing-resources/bacpage-command.html) # Introduction Advances in sequencing technology during the COVID-19 pandemic have led to massive increases in the generation of sequencing data. Many bioinformatics tools have been developed to analyze this data, but very few tools can be utilized by individuals without prior bioinformatics training. This pipeline was designed to encapsulate pre-existing tools to automate analysis of whole genome sequencing of bacteria. Installation is fast and straightforward. The pipeline is easy to set up and contains rational defaults, but is highly modular and configurable by more advanced users. Bacpage has individual commands to generate consensus sequences, perform *de novo* assembly, construct phylogenetic trees, and generate quality control reports. # Features We anticipate the pipeline will be able to perform the following functions: - [x] Reference-based assembly of Illumina paired-end reads - [x] *De novo* assembly of Illumina paired-end reads - [ ] *De novo* assembly of ONT long reads - [x] Run quality control checks - [x] Variant calling using [bcftools](https://github.com/samtools/bcftools) - [x] Maximum-likelihood phylogenetic inference of processed samples and background dataset using [iqtree](https://github.com/iqtree/iqtree2) - [x] MLST profiling and virulence factor detection - [x] Antimicrobial resistance genes detection - [ ] Plasmid detection # Installation 1. Install `mamba` by running the following two commands: ```commandline curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" bash Mambaforge-$(uname)-$(uname -m).sh ``` 2. Clone the bacpage repository: ```commandline git clone https://github.com/CholGen/bacpage.git ``` 3. Switch to the development branch of the pipeline: ```commandline cd bacpage/ git checkout -b split_into_command ``` 3. 
Install and activate the pipeline's conda environment: ```commandline mamba env create -f environment.yaml mamba activate bacpage ``` 4. Install the `bacpage` command: ```commandline pip install . ``` 5. Test the installation: ```commandline bacpage -h bacpage version ``` These command should print the help and version of the program. Please create an issue if this is not the case. # Updating 1. Navigate to the directory where you cloned the bacpage repository on the command line: ```commandline cd bacpage/ ``` 2. Activate the bacpage conda environment: ```commandline mamba activate bacpage ``` 3. Pull the lastest changes from GitHub: ```commandline git pull ``` 4. Update the bacpage conda environemnt: ```commandline mamba env update -f environment.yaml ``` 5. Reinstall the `bacpage` command: ```commandline pip install . ``` # Usage 0. Activate the bacpage conda environment: ```commandline mamba activate bacpage ``` 1. Create a directory specifically for the batch of samples you would like to analyze (called a project directory). ```commandline bacpage setup [your-project-directory-name] ``` 2. Place paired sequencing reads in the `input/` directory of your project directory. 3. From the pipeline's directory, run the reference-based assembly pipeline on your samples using the following command: ```commandline bacpage assemble [your-project-directory-name] ``` This will generate a consensus sequence in FASTA format for each of your samples and place them in `/results/consensus_sequences/.masked.fasta`. An HTML report containing alignment and quality metrics for your samples can be found at `/results/reports/qc_report.html`. """ ; ns1:image ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Phylogeny reconstruction using bacpage" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ONT raw reads collection" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-08T15:25:44Z"^^ns1:Date ; ns1:dateModified "2025-06-01T12:11:10Z"^^ns1:Date ; ns1:description "The workflow takes ONT reads collection, runs SeqKit and Nanoplot. The main outputs are a table and plots of raw reads stats." ; ns1:image ; ns1:input ; ns1:isPartOf , , , ; ns1:keywords "ONT, ERGA, DataQC" ; ns1:license ; ns1:name "ERGA DataQC ONT v2505 (WF0)" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Ploidy" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Trimmed Illumina collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/kmer length" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Merged Meryl DB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/genome_size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/max_depth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/transition_parameter" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-08T15:55:18Z"^^ns1:Date ; ns1:dateModified "2024-01-08T15:57:54Z"^^ns1:Date ; ns1:description "The workflow takes a trimmed Illumina paired-end reads collection, runs Meryl to create a K-mer database, Genomescope2 to estimate genome properties and Smudgeplot to estimate ploidy. The main results are K-mer ddatabase and genome profiling plots, tables, and values useful for downstream analysis. Default K-mer length and ploidy for Genomescope are 21 and 2, respectively. 
" ; ns1:image ; ns1:input , , ; ns1:isPartOf , ; ns1:keywords "name:PROFILING, ERGA, illumina" ; ns1:license ; ns1:name "ERGA Profiling Illumina v2311 (WF1)" ; ns1:output , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2020-11-19T13:29:51Z"^^ns1:Date ; ns1:dateModified "2023-06-27T12:39:11Z"^^ns1:Date ; ns1:description "Scipion is a workflow engine mostly for Cryo-Electron Microscopy image processing. In this extremely simple workflow, we load the Relion 3.0 tutorial data and process it to 2.9A resolution." ; ns1:keywords "Electron microscopy, image processing, single particle analysis" ; ns1:license ; ns1:name "Scipion Tutorial example reaching 2.9A resolution" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T12:44:38Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:39:57Z"^^ns1:Date ; ns1:description "Dating the most recent common ancestor (MRCA) of SARS-CoV-2. The workflow is used to extract full length sequences of SARS-CoV-2, tidy up their names in FASTA files, produce a multiple sequences alignment and compute a maximum likelihood tree. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - MRCA analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . 
a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GenBank file " . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Paired Collection (fastqsanger)" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-01-10T01:19:39Z"^^ns1:Date ; ns1:dateModified "2024-01-10T01:20:12Z"^^ns1:Date ; ns1:description """# Pangenome databases provide superior host removal and mycobacteria classification from clinical metagenomic data > Hall, M, Coin, L., Pangenome databases provide superior host removal and mycobacteria classification from clinical metagenomic data. bioRxiv 2023. doi: [10.1101/2023.09.18.558339][doi] Benchmarking different ways of doing read (taxonomic) classification, with a focus on removal of contamination and classification of _M. tuberculosis_ reads. This repository contains the code and snakemake pipeline to build/download the databases, obtain all results from [the paper][doi], along with accompanying configuration files. 
Custom databases have all been uploaded to Zenodo, along with the simulated reads: - Nanopore simulated metagenomic reads - - Illumina simulated metagenomic reads - - Nanopore and Illumina artificial real reads - - Kraken2 database built from the Human Pangenome Reference Consortium genomes - - Kraken2 database built from the kraken2 Human library - - Kraken2 database built from a *Mycobacterium* representative set of genomes - - A (fasta) database of representative genomes from the *Mycobacterium* genus - - A (fasta) database of *M. tuberculosis* genomes from a variety of lineages - - The fasta file built from the [Clockwork](https://github.com/iqbal-lab-org/clockwork) decontamination pipeline - ## Example usage We provide some usage examples showing how to download the databases and then use them on your reads. ### Human read removal The method we found to give the best balance of runtime, memory usage, and precision and recall was kraken2 with a database built from the Human Pangenome Reference Consortium genomes. This example has been wrapped into a standalone tool called [`nohuman`](https://github.com/mbhall88/nohuman/) which takes a fastq as input and returns a fastq with human reads removed. #### Download human database ``` mkdir HPRC_db/ cd HPRC_db URL="https://zenodo.org/record/8339732/files/k2_HPRC_20230810.tar.gz" wget "$URL" tar -xzf k2_HPRC_20230810.tar.gz rm k2_HPRC_20230810.tar.gz ``` #### Run kraken2 with HPRC database You'll need [kraken2](https://github.com/DerrickWood/kraken2) installed for this step. 
``` kraken2 --threads 4 --db HPRC_db/ --output classifications.tsv reads.fq ``` If you are using Illumina reads, a slight adjustment is needed ``` kraken2 --paired --threads 4 --db HPRC_db/ --output classifications.tsv reads_1.fq reads_2.fq ``` #### Extract non-human reads You'll need [seqkit](https://github.com/shenwei356/seqkit) installed for this step For Nanopore data ``` awk -F'\\t' '$1=="U" {print $2}' classifications.tsv | \\ seqkit grep -f - -o reads.depleted.fq reads.fq ``` For Illumina data ``` awk -F'\\t' '$1=="U" {print $2}' classifications.tsv > ids.txt seqkit grep --id-regexp '^(\\S+)/[12]' -f ids.txt -o reads_1.depleted.fq reads_1.fq seqkit grep --id-regexp '^(\\S+)/[12]' -f ids.txt -o reads_2.depleted.fq reads_2.fq ``` ### *M. tuberculosis* classification/enrichment For this step we recommend either [minimap2](https://github.com/lh3/minimap2) or kraken2 with a *Mycobacterium* genus database. We leave it to the user to decide which approach they prefer based on the results in our manuscript. 
#### Download databases ``` mkdir Mycobacterium_db cd Mycobacterium_db # download database for use with minimap2 URL="https://zenodo.org/record/8339941/files/Mycobacterium.rep.fna.gz" wget "$URL" IDS_URL="https://zenodo.org/record/8343322/files/mtb.ids" wget "$IDS_URL" # download kraken database URL="https://zenodo.org/record/8339822/files/k2_Mycobacterium_20230817.tar.gz" wget "$URL" tar -xzf k2_Mycobacterium_20230817.tar.gz rm k2_Mycobacterium_20230817.tar.gz ``` #### Classify reads **minimap2** ``` # nanopore minimap2 --secondary=no -c -t 4 -x map-ont -o reads.aln.paf Mycobacterium_db/Mycobacterium.rep.fna.gz reads.depleted.fq # illumina minimap2 --secondary=no -c -t 4 -x sr -o reads.aln.paf Mycobacterium_db/Mycobacterium.rep.fna.gz reads_1.depleted.fq reads_2.depleted.fq ``` **kraken2** ``` # nanopore kraken2 --db Mycobacterium_db --threads 4 --report myco.kreport --output classifications.myco.tsv reads.depleted.fq # illumina kraken2 --db Mycobacterium_db --paired --threads 4 --report myco.kreport --output classifications.myco.tsv reads_1.depleted.fq reads_2.depleted.fq ``` #### Extract *M. 
tuberculosis* reads **minimap2** ``` # nanopore grep -Ff Mycobacterium_db/mtb.ids reads.aln.paf | cut -f1 | \\ seqkit grep -f - -o reads.enriched.fq reads.depleted.fq # illumina grep -Ff Mycobacterium_db/mtb.ids reads.aln.paf | cut -f1 > keep.ids seqkit grep -f keep.ids -o reads_1.enriched.fq reads_1.depleted.fq seqkit grep -f keep.ids -o reads_2.enriched.fq reads_2.depleted.fq ``` **kraken2** We'll use the [`extract_kraken_reads.py` script](https://github.com/jenniferlu717/KrakenTools#extract_kraken_readspy) for this ``` # nanopore python extract_kraken_reads.py -k classifications.myco.tsv -1 reads.depleted.fq -o reads.enriched.fq -t 1773 -r myco.kreport --include-children # illumina python extract_kraken_reads.py -k classifications.myco.tsv -1 reads_1.depleted.fq -2 reads_2.depleted.fq -o reads_1.enriched.fq -o2 reads_2.enriched.fq -t 1773 -r myco.kreport --include-children ``` [doi]: https://doi.org/10.1101/2023.09.18.558339 """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.700.2" ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Pangenome databases provide superior host removal and mycobacteria classification from clinical metagenomic data" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Estimated genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GFA contigs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Meryl database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ONT reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Transition parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/max depth" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/output" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-09T10:40:06Z"^^ns1:Date ; ns1:dateModified "2024-01-09T10:44:51Z"^^ns1:Date ; ns1:description "The workflow takes a trimmed Illumina WGS paired-end reads collection, Collapsed contigs, and the values for transition parameter and max coverage depth (calculated from WF1) to run Purge_Dups. It produces purged Collapsed contigs assemblies, and runs all the QC analysis (gfastats, BUSCO, and Merqury). " ; ns1:image ; ns1:input , , , , , , ; ns1:isPartOf , ; ns1:keywords "name:ASSEMBLY+QC, ERGA, illumina" ; ns1:license ; ns1:name "ERGA ONT+Illumina Collapsed Purge+QC v2311 (WF3)" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "ERGA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC F trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "HiC R trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged GFA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_2" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-09T11:00:47Z"^^ns1:Date ; ns1:dateModified "2024-01-09T11:00:47Z"^^ns1:Date ; ns1:description "The workflow takes trimmed HiC forward and reverse reads, and one assembly (e.g.: Hap1 or Pri or Collapsed) to produce a scaffolded assembly using YaHS. It also runs all the QC analyses (gfastats, BUSCO, and Merqury). 
" ; ns1:image ; ns1:input , , , , , ; ns1:isPartOf , ; ns1:keywords "name:ASSEMBLY+QC, ERGA, HiC" ; ns1:license ; ns1:name "ERGA HiC Collapsed Scaffolding+QC YaHS v2311 (WF4)" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Python" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-09T13:04:17Z"^^ns1:Date ; ns1:dateModified "2024-01-18T18:24:01Z"^^ns1:Date ; ns1:description """# Framework for construction of phylogenetic networks on High Performance Computing (HPC) environment ## Introduction Phylogeny refers to the evolutionary history and relationship between biological lineages related by common descent. Reticulate evolution refers to the origination of lineages through the complete or partial merging of ancestor lineages. Networks may be used to represent lineage independence events in non-treelike phylogenetic processes. The methodology for reconstructing networks is still in development. Here we explore two methods for reconstructing rooted explicit phylogenetic networks, PhyloNetworks and Phylonet, which employ computationally expensive and time consuming algorithms. The construction of phylogenetic networks follows a coordinated processing flow of data sets analyzed and processed by the coordinated execution of a set of different programs, packages, libraries or pipelines, called workflow activities. In view of the complexity in modeling network experiments, the present work introduces a workflow for phylogenetic network analyses coupled to be executed in High-Performance Computing (HPC) environments. The workflow aims to integrate well-established software, pipelines and scripts, implementing a challenging task since these tools do not consistently profit from the HPC environment, leading to an increase in the expected makespan and idle computing resources. ## Requirements 1. 
Python >= 3.8 1. Biopython >= 1.75 2. Pandas >= 1.3.2 3. Parsl >= 1.0 3. Raxml >= 8.2.12 4. Astral >= 5.7.1 5. SnaQ (PhyloNetworks) >= 0.13.0 6. MrBayes >= 3.2.7a 7. BUCKy >= 1.4.4 8. Quartet MaxCut >= 2.10 9. PhyloNet >= 3.8.2 10. Julia >= 1.4.1 11. IQTREE >= 2.0 ## How to use ### Setting up the framework The framework uses a file to get all the needed parameters. For default it loads the file *default.ini* in the config folder, but you can explicitly load other files using the argument ``-s name_of_the_file``, *e.g.* ``-s config/test.ini``. * Edit *parl.env* with the environment variables you may need, such as modules loadeds in SLURM * Edit *work.config* with the directories of your phylogeny studies (the framework receives as input a set of homologous gene alignments of species in the nexus format). * Edit *default.ini* with the path for each of the needed softwares and the parameters of the execution provider. For default, the execution logs are created in the ``runinfo`` folder. To change it you can use the `-r folder_path` parameter. #### Contents of the configuration file * General settings ```ini [GENERAL] ExecutionProvider = SLURM ScriptDir = ./scripts Environ = config/parsl.env Workload = config/work.config NetworkMethod = MP TreeMethod = RAXML BootStrap = 1000 ``` 1. The framework can be executed in a HPC environment using the Slurm resource manager using the parameter ``ExecutionProvider`` equals to ``SLURM`` or locally with ``LOCAL``. 2. The path of the scripts folder is assigned in ``ScriptDir``. It's recommended to use the absolute path to avoid errors. 3. The ``Environ`` parameter contains the path of the file used to set environment variables. More details can be seen below. 4. In ``Workload`` is the path of the experiments that will be performed. 5. ``NetworkMethod`` and ``TreeMethod`` are the default network and tree methods that will be used to perform the workloads' studies. 6. 
``Bootstrap`` is the parameter used in all the software that use bootstrap (RAxML, IQTREE and ASTRAL) * Workflow execution settings When using SLURM, these are the needed parameters: ```ini [WORKFLOW] Monitor = False PartCore = 24 PartNode = 1 Walltime = 00:20:00 ``` 1. ``Monitor`` is a parameter to use parsl's monitor module in HPC environment. It can be *true* or *false*. If you want to use it, it's necessary to set it as *true* and manually change the address in ``infra_manager.py`` 2. If you are using it in a HPC environment (using SLURM), the framework is going to submit in a job. ``PartCore`` is the number of cores of the node; ``PartNode`` is the number of nodes of the partition; and the ``Walltime`` parameter is the maximum amount of time the job will be able to run. However, if the the desired execution method is the LocalProvider, _i.e._ the execution is being performed in your own machine, only these parameters are necessary: ```ini [WORKFLOW] Monitor = False MaxCore = 6 CoresPerWorker = 1 ``` * RAxML settings ```ini [RAXML] RaxmlExecutable = raxmlHPC-PTHREADS RaxmlThreads = 6 RaxmlEvolutionaryModel = GTRGAMMA --HKY85 ``` * IQTREE settings ```ini [IQTREE] IqTreeExecutable = iqtree2 IqTreeEvolutionaryModel = TIM2+I+G IqTreeThreads = 6 ``` * ASTRAL settings ```ini [ASTRAL] AstralExecDir = /opt/astral/5.7.1 AstralJar = astral.jar ``` * PhyloNet settings ```ini [PHYLONET] PhyloNetExecDir = /opt/phylonet/3.8.2/ PhyloNetJar = PhyloNet.jar PhyloNetThreads = 6 PhyloNetHMax = 3 PhyloNetRuns = 5 ``` * SNAQ settings ```ini [SNAQ] SnaqThreads = 6 SnaqHMax = 3 SnaqRuns = 3 ``` * Mr. 
Bayes settings ```ini [MRBAYES] MBExecutable = mb MBParameters = set usebeagle=no beagledevice=cpu beagleprecision=double; mcmcp ngen=100000 burninfrac=.25 samplefreq=50 printfreq=10000 diagnfreq=10000 nruns=2 nchains=2 temp=0.40 swapfreq=10 ``` * Bucky settings ```ini [BUCKY] BuckyExecutable = bucky MbSumExecutable = mbsum ``` * Quartet MaxCut ```ini QUARTETMAXCUT] QmcExecDir = /opt/quartet/ QmcExecutable = find-cut-Linux-64 ``` #### Workload file For default the workload file is ``work.config`` in the *config* folder. The file contains the absolute paths of the experiment's folders. ``` /home/rafael.terra/Biocomp/data/Denv_1 ``` You can comment folders using the # character in the beginning of the path. *e. g.* ``#/home/rafael.terra/Biocomp/data/Denv_1``. That way the framework won't read this path. You can also run a specific flow for a path using ``@TreeMethod|NetworkMethod`` in the end of a path. Where *TreeMethod* can be RAXML, IQTREE or MRBAYES and *NetworkMethod* can be MPL or MP (case sensitive). The supported flows are: ``RAXML|MPL``, ``RAXML|MP``, ``IQTREE|MPL``, ``IQTREE|MP`` and ``MRBAYES|MPL``. For example: ``` /home/rafael.terra/Biocomp/data/Denv_1@RAXML|MPL ``` #### Environment file The environment file contains all the environment variables (like module files used in SLURM) used during the framework execution. Example: ```sh module load python/3.8.2 module load raxml/8.2_openmpi-2.0_gnu module load java/jdk-12 module load iqtree/2.1.1 module load bucky/1.4.4 module load mrbayes/3.2.7a-OpenMPI-4.0.4 source /scratch/app/modulos/julia-1.5.1.sh ``` #### Experiment folder Each experiment folder needs to have a *input folder* containing a *.tar.gz* compressed file and a *.json* with the following content. **The framework considers that there is only one file of each extension in the input folder**. 
```json { "Mapping":"", "Outgroup":"" } ``` Where ``Mapping`` is a direct mapping of the taxon, when there are multiple alleles per species, in the format ``species1:taxon1,taxon2;species2:taxon3,taxon4`` *(white spaces are not supported)* and ``Outgroup`` is the taxon used to root the network. The Mapping parameter is optional (although it has to be in the json file without value), but the outgroup is obligatory. It's important to say that the flow *MRBAYES|MPL* doesn't support multiple alleles per species. Example: ```json { "Mapping": "dengue_virus_type_2:FJ850082,FJ850088,JX669479,JX669482,JX669488,KP188569;dengue_virus_type_3:FJ850079,FJ850094,JN697379,JX669494;dengue_virus_type_1:FJ850073,FJ850084,FJ850093,JX669465,JX669466,JX669475,KP188545,KP188547;dengue_virus_type_4:JN559740,JQ513337,JQ513341,JQ513343,JQ513344,JQ513345,KP188563,KP188564;Zika_virus:MH882543", "Outgroup": "MH882543" } ``` ## Running the framework * In a local machine: After setting up the framework, just run ``python3 parsl_workflow.py``. * In a SLURM environment: Create an submition script that inside contains: ``python3 parsl_workflow.py``. ```sh #!/bin/bash #SBATCH --time=15:00:00 #SBATCH -e slurm-%j.err #SBATCH -o slurm-%j.out module load python/3.9.6 cd /path/to/biocomp python3 parsl_workflow.py ``` The framework is under heavy development. If you notice any bug, please create an issue here on GitHub. ### Running in a DOCKER container The framework is also available to be used in Docker. It can be built from source or pushed from DockerHub. #### Building it from the source code Adapt the default settings file ``config/default.ini`` according to your machine, setting the number of threads and bootstrap. After that, run ``docker build -t hp2net .`` in the project's root folder. #### Downloading it from Dockerhub The docker image can also be downloaded from [Docker hub](https://hub.docker.com/repository/docker/rafaelstjf/hp2net/general). 
To do that, just run the command ``docker pull rafaelstjf/hp2net:main`` #### Running The first step to run the framework is to setup your dataset. To test if the framework is running without problems in your machine, you can use the [example datasets](example_data). ![Alt text](docs/example_data.png) Extracting the ``example_data.zip`` file, a new folder called ``with_outgroup`` is created. This folder contain four datasets of DENV sequences. The next step is the creation of the settings and workload files. For the settings file, download the [default.ini](config/default.ini) from this repository and change it to you liking (the path of all software are already configured to run on docker). The workload file is a text file containing the absolute path of the datasets, followed by the desired pipeline, as shown before in this document. Here for example purposes, the ``input.txt`` file was created. ![Alt text](docs/example_files.png) With all the files prepared, the framework can be executed from the ``example_data`` folder as following: ``docker run --rm -v $PWD:$PWD rafaelstjf/hp2net:main -s $PWD/default.ini -w $PWD/input.txt`` **Important:** the docker doesn't save your logs, for that add the parameter: ``-r $PWD/name_of_your_log_folder``. --- If you are running it on **Santos Dumont Supercomputer**, both downloading and execution of the docker container need to be performed from a submission script and executed using ``sg docker -c "sbatch script.sh"``. The snippet below shows an example of submission script. 
```sh #!/bin/bash #SBATCH --nodes=1 #SBATCH --ntasks-per-node=24 #SBATCH -p cpu_small #SBATCH -J Hp2NET #SBATCH --exclusive #SBATCH --time=02:00:00 #SBATCH -e slurm-%j.err #SBATCH -o slurm-%j.out DIR='/scratch/pcmrnbio2/rafael.terra/WF_parsl/example_data' docker pull rafaelstjf/hp2net:main docker run --rm -v $DIR:$DIR rafaelstjf/hp2net:main -s ${DIR}/sdumont.ini -w ${DIR}/entrada.txt -r ${DIR}/logs ``` ## If you use it, please cite Terra, R., Coelho, M., Cruz, L., Garcia-Zapata, M., Gadelha, L., Osthoff, C., ... & Ocana, K. (2021, July). Gerência e Análises de Workflows aplicados a Redes Filogenéticas de Genomas de Dengue no Brasil. In *Anais do XV Brazilian e-Science Workshop* (pp. 49-56). SBC. **Also cite all the coupled software!** """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.703.1" ; ns1:keywords "Bioinformatics, Parsl, phylogenetics, HPC" ; ns1:license ; ns1:name "HP2NET - Framework for Construction of Phylogenetic Networks on High Performance Computing (HPC) Environment" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2024-01-24T10:38:28Z"^^ns1:Date ; ns1:dateModified "2024-02-05T10:09:43Z"^^ns1:Date ; ns1:description """![workflow](https://github.com/naturalis/barcode-constrained-phylogeny/actions/workflows/python-package-conda.yml/badge.svg) [![License: Apache-2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10519081.svg)](https://doi.org/10.5281/zenodo.10519081) ![Logo](https://github.com/naturalis/barcode-constrained-phylogeny/blob/main/doc/logo-small.png?raw=true) # Bactria: BarCode TRee Inference and Analysis This repository contains code and data for building very large, topologically-constrained barcode phylogenies through a divide-and-conquer strategy. Such trees are useful as reference materials for curating barcode data by detecting rogue terminals (indicating incorrect taxonomic annotation) and in the comparable calculation of alpha and beta biodiversity metrics across metabarcoding assays. The input data for the approach we develop here currently comes from BOLD data dumps. The international database [BOLD Systems](https://www.boldsystems.org/index.php) contains DNA barcodes for hundreds of thousands of species, with multiple barcodes per species. The data dumps we use here are TSV files whose columns conform to the nascent BCDM (barcode data model) vocabulary. As such, other data sources that conform to this vocabulary could in the future be used as well, such as [UNITE](https://unite.ut.ee/). Theoretically, such data could be filtered and aligned per DNA marker to make phylogenetic trees. 
However, there are two limiting factors: building very large phylogenies is computationally intensive, and barcodes are not considered ideal for building big trees because they are short (providing insufficient signal to resolve large trees) and because they tend to saturate across large patristic distances. ![concept](https://github.com/naturalis/barcode-constrained-phylogeny/blob/main/doc/concept.png) Both problems can be mitigated by using the [Open Tree of Life](https://tree.opentreeoflife.org/opentree/argus/opentree13.4@ott93302) as a further source of phylogenetic signal. The BOLD data can be split into chunks that correspond to Open Tree of Life clades. These chunks can be made into alignments and subtrees. The OpenTOL can be used as a constraint in the algorithms to make these. The chunks are then combined in a large synthesis by grafting them on a backbone made from exemplar taxa from the subtrees. Here too, the OpenTOL is a source of phylogenetic constraint. In this repository this concept is developed for both animal species and plant species. ## Installation The pipeline and its dependencies are managed using conda. On a linux or osx system, you can follow these steps to set up the `bactria` Conda environment using an `environment.yml` file and a `requirements.txt` file: 1. **Clone the Repository:** Clone the repository containing the environment files to your local machine: ```bash git clone https://github.com/naturalis/barcode-constrained-phylogeny.git cd barcode-constrained-phylogeny ``` 2. **Create the Conda Environment:** Create the bactria Conda environment using the environment.yml file with the following command: ```bash conda env create -f workflow/envs/environment.yml ``` This command will create a new Conda environment named bactria with the packages specified in the environment.yml file. 
This step is largely a placeholder because most of the dependency management is handled at the level of individual pipeline steps, which each have their own environment specification. 3. **Activate the Environment:** After creating the environment, activate it using the conda activate command: ```bash conda activate bactria ``` 4. **Verify the Environment:** Verify that the bactria environment was set up correctly and that all packages were installed using the conda list command: ```bash conda list ``` This command will list all packages installed in the active conda environment. You should see all the packages specified in the environment.yml file and the requirements.txt file. ## How to run The pipeline is implemented using snakemake, which is available within the conda environment that results from the installation. Important before running the snakemake pipeline is to change in [config/config.yaml](config/config.yaml) the number of threads available on your computer. Which marker gene is used in the pipeline is also specified in the config.yaml (default COI-5P). Prior to execution, the BOLD data package to use (we used the [release of 30 December 2022](https://www.boldsystems.org/index.php/datapackage?id=BOLD_Public.30-Dec-2022)) must be downloaded manually and stored in the [resources/](resources/) directory. If a BOLD release from another date is used the file names in config.yaml need to be updated. How to run the entire pipeline: ```bash snakemake -j {number of threads} --use-conda ``` Snakemake rules can be performed separately: ```bash snakemake -R {Rule} -j {number of threads} --use-conda ``` Enter the same number at {number of threads} as you filled in previously in src/config.yaml. In {Rule} insert the rule to be performed. 
Here is an overview of all the rules in the Snakefile: ![graphviz (1)](https://github.com/naturalis/barcode-constrained-phylogeny/blob/main/doc/dag.svg) (zoomed view is available [here](https://raw.githubusercontent.com/naturalis/barcode-constrained-phylogeny/main/doc/dag.svg)) ## Repository layout Below is the top-level layout of the repository. This layout is in line with [community standards](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html) and must be adhered to. All of these subfolders contains further explanatory READMEs to explain their contents in more detail. - [config](config/) - configuration files - [doc](doc/) - documentation and background literature - [logs](logs/) - where log files are written during pipeline runtime - [resources](resources/) - external data resources (from BOLD and OpenTree) are downloaded here - [results](results/) - intermediate and final results are generated here - [workflow](workflow/) - script source code and driver snakefile ## License © 2023 Naturalis Biodiversity Center Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at [http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0) Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.""" ; ns1:image ; ns1:keywords "Bioinformatics, Python, Snakemake, phylogenetics" ; ns1:license ; ns1:name "Bactria: BarCode TRee Inference and Analysis" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-01-24T14:45:26Z"^^ns1:Date ; ns1:dateModified "2024-08-02T14:19:24Z"^^ns1:Date ; ns1:description "Lysozyme in water full COMPSs application, using dataset_small" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Lysozyme in water, using dataset_small, data_persistence True" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-01-24T15:02:34Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:51Z"^^ns1:Date ; ns1:description "Lysozyme in water full COMPSs application, using dataset_small" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.708.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, non_data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Lysozyme in water full version, using dataset_small, data_persistence False" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-25T14:22:01Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:47Z"^^ns1:Date ; ns1:description "Wordcount merge version COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.709.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, non_data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Wordcount merge version, data_persistence False" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-25T14:26:17Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:43Z"^^ns1:Date ; ns1:description "Wordcount reduce version COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.710.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Wordcount reduce version, data_persistence True" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-25T14:29:25Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:36Z"^^ns1:Date ; ns1:description "Cholesky factorisation COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.711.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Cholesky factorisation, SIZE 4, BSIZE 512, data_persistence True" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-25T14:32:38Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:40Z"^^ns1:Date ; ns1:description "K-means COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.712.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "K-means data_persistence True" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-01-25T14:37:39Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:33Z"^^ns1:Date ; ns1:description "Cluster Comparison COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.713.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Cluster Comparison data_persistence True" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-01-25T16:17:10Z"^^ns1:Date ; ns1:dateModified "2026-03-16T15:55:28Z"^^ns1:Date ; ns1:description "Lysozyme in water sample COMPSs application" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.714.1" ; ns1:image ; ns1:isPartOf ; ns1:keywords "PyCOMPSs, data_persistence, Marenostrum IV, Supercomputer" ; ns1:license ; ns1:name "Lysozyme in water sample, dataset_small, data_persistence True, nct00014 username, 4 workers" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/BAM files with CB and UB" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/filtered matrices in bundle" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/gtf file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/velocyto loom" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-02-20T03:01:57Z"^^ns1:Date ; ns1:dateModified "2026-02-20T03:01:57Z"^^ns1:Date ; ns1:description "Processes 10X Genomics single-cell RNA-seq data using Velocyto to quantify spliced and unspliced transcript counts for RNA velocity analysis. Automatically extracts cell barcodes from standard 10X bundled output and generates a loom file containing separate counts for spliced exons, unspliced introns, and ambiguous regions. Enables downstream trajectory inference and cellular dynamics analysis." ; ns1:input , , ; ns1:isBasedOn ; ns1:keywords "name:single-cell" ; ns1:license ; ns1:name "velocyto/Velocyto-on10X-from-bundled" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 3 . a ns1:Person ; ns1:name "Marta Martinez" . a ns1:Person ; ns1:name "Roberto Melero" . a ns1:ComputerLanguage ; ns1:name "Scipion" ; ns1:url . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Nextflow" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-02-02T05:15:00Z"^^ns1:Date ; ns1:dateModified "2024-02-08T00:03:01Z"^^ns1:Date ; ns1:description """![](https://github.com/AusARG/pipesnake/blob/main/docs/images/pipesnake_Logo.png)   Welcome to the *pipesnake*. Let's get started. --- # Introduction **pipesnake** is a bioinformatics best-practice analysis pipeline for phylogenomic reconstruction starting from short-read 'second-generation' sequencing data. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. --- # Motivation + Project Background We developed *pipesnake* as part of the [***Aus***tralian ***A***mphibian and ***R***eptile ***G***enomics](https://ausargenomics.com/) (*AusARG*) initiative. **AusARG** is a national collaborative project aiming to facilitate the development of genomics resources for Australia's unique amphibian and reptile fauna. This pipeline was developed specifically as part of the *AusARG Phylogenomics Working Group* with the goal of collecting a consistent set of phylogenomic data for all of Australia's frogs and reptiles, under similar assembly, alignment, and tree estimation procedures. *pipesnake* is however, applicable to much broader phylogenomic questions, and is appropriate for processing exon-capture or transcriptomic data, so long as the **input is second-generation (short-read) data**. """ ; ns1:keywords "" ; ns1:license ; ns1:name "pipesnake" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Collection of Pacbio Data" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Email adress" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genetic Code" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species name (latin name)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compressed Mitogenome" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mitogenome annotation: GenBank" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mitogenome annotation: Image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mitogenome coverage: Image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Mitogenome: Contigs Statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name for report" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2025-08-29T02:01:49Z"^^ns1:Date ; ns1:dateModified "2025-12-12T02:01:28Z"^^ns1:Date ; ns1:description "Generate mitochondrial assembly based on PacBio HiFi reads. Part of the VGP suite, it can be run at any time independently of the other workflows. This workflow uses MitoHiFi and a mitochondrial reference to assemble the mitochondrial genome from PacBio reads. You do not need to provide the reference yourself, only the Latin name of the species." ; ns1:input , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Mitogenome-assembly-VGP0/main" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , ; ns1:dateCreated "2020-11-19T13:34:09Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:46:12Z"^^ns1:Date ; ns1:description "Continuous flexibility analysis of SARS-CoV-2 Spike prefusion structures" ; ns1:image ; ns1:keywords "covid-19, image processing, bioimaging" ; ns1:license ; ns1:name "Scipion workflow for Cryo electron microscopy of SARS-CoV-2 stabilized spike in prefusion state" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Snakemake" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2024-02-13T09:44:35Z"^^ns1:Date ; ns1:dateModified "2024-02-13T09:44:35Z"^^ns1:Date ; ns1:description """## EBP-Nor Genome Assembly pipeline This repository contains the EBP-Nor genome assembly pipeline. This pipeline is implemented in snakemake. This pipeline is developed to create haplotype-resolved genome assemblies from PacBio HiFi reads and HiC reads, and is primarly designed for diploid eukaryotic organisms. The pipeline is designed to work on a linux cluster with slurm as workload manager. ## Requirements & Setup Some software need to be configured/installed before the pipeline can be run ### Conda setup Most required software, including snakemake itself, can be installed using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Once conda is installed, you can create a new environment containing most necessary software from the provided asm_pipeline.yaml file as follows: ```shell conda create -n asm_pipeline --file=worfklow/envs/asm_pipeline.yaml ``` ### Other software setup The following software need to be installed manually: - KMC v3.1.1 (https://github.com/tbenavi1/KMC) - HiFiAdapterFilt (https://github.com/sheinasim/HiFiAdapterFilt) - Oatk (https://github.com/c-zhou/oatk) - OatkDB (https://github.com/c-zhou/OatkDB) - NCBI FCS-Adaptor (https://github.com/ncbi/fcs/wiki/FCS-adaptor) - NCBI FCS-GX (https://github.com/ncbi/fcs/wiki/FCS-GX) Please refer to their respective installation instructions to properly install them. You will need to privide the installation paths of these software to the config file (see Parameter section). ### BUSCO database setup As in general, computing nodes are not connected to the internet, BUSCO lineage datasets need to be downloaded manually before running the pipeline. 
This can easily be done by running ```shell busco --download eukaryota ``` You will need to specify the folder where you downloaded the busco lineages in the config file (see Parameter section). ### Data This pipeline is created for using PacBio HiFi reads together with paired-end Hi-C data. You will need to specify the absolute paths to these files in the config file (see Parameters section). ### Parameters The necessary config files for running the pipeline can be found in the config folder. General snakemake and cluster submission parameters are defined in ```config/config.yaml```, data- and software-specfic parameters are defined in ```config/asm_params.yaml```. First, define the paths of the input files you want to use: - pacbio: path to the location of the PacBio HiFi reads (```.fastq.gz```) - hicF and hicR: path to the forward and reverse HiC reads respectively For software not installed by conda, the installation path needs to be provided to the Snakemake pipeline by editing following parameters in the ```config/asm_params.yaml```: - Set the "adapterfilt_install_dir" parameter to the installation path of HiFiAdapterFilt - Set the "KMC_path" parameter to the installation path of KMC - Set the "oatk_dir" parameter to the installation path of oatk - Set the "oatk_db" parameter to the directory where you downloaded the oatk_db files - Set the "fcs_path" parameter to the location of the ```run_fcsadaptor.sh``` and ```fcs.py``` scripts - Set the "fcs_adaptor_image" and "fcs_gx_image" parameters to the paths to the ```fcs-adaptor.sif``` and ```fcs-gx.sif``` files respectively - Set the "fcs_gx_db" parameter to the path of the fcs-gx database A couple of other parameters need to be verified as well in the config/asm_params.yaml file before running the pipeline: - The location of the input data (```input_dir```) should be set to the folder containing the input data. 
- The location of the downloaded busco lineages (```busco_db_dir```) should be set to the folder containing the busco lineages files downloaded earlier - The required BUSCO lineage for running the BUSCO analysis needs to be set (```busco_lineage``` parameter). Run ```busco --list-datasets``` to get an overview of all available datasets. - The required oatk lineage for running organelle genome assembly (```oatk_lineage``` parameter). Check https://github.com/c-zhou/OatkDB for an overview of available lineages. - A boolean value whether the species is a plant (for plastid prediction) or not (```oatk_isPlant```; set to either True or False) - The NCBI taxid of your species, required for the decontamination step (```taxid``` parameter) ## Usage and run modes Before running, make sure to activate the conda environment containing the necessary software: ```conda activate asm_pipeline```. To run the pipeline, run the following command: ``` snakemake --profile config/ --configfile config/asm_params.yaml --snakefile workflow/Snakefile {run_mode} ``` If you invoke the snakemake command in another directory than the one containing the ```workflow``` and ```config``` folders, or if the config files (```config.yaml``` and ```asm_params.yaml```) are in another location, you need to specify their correct paths on the command line. 
The workflow parameters can be modified in 3 ways: - Directly modifying the ```config/asm_params.yaml``` file - Overriding the default parameters on the command line: ```--config parameter=new_value``` - Overriding the default parameters using a different yaml file: ```--configfile path_to_parameters.yaml``` The pipeline has different running modes, and the run mode should always be the last argument on the command line: - "all" (default): will run the full workflow including pre-assembly (genomescope & smudgeplot), assembly, scaffolding, decontamination, and organelle assembly - "pre_assembly": will run only the pre-assembly steps (genomescope & smudgeplot) - "assembly": will filter the HiFi reads and assemble them using hifiasm (also using the Hi-C reads), and run busco - "scaffolding": will run all steps necessary for scaffolding (filtering, assembly, HiC filtering, scaffolding, busco), but without pre-assembly - "decontamination": will run assembly, scaffolding, and decontamination, but without pre-assembly and busco analyses - "organelles": will run only organelle genome assembly ## Output All generated output will be present in the "results" directory, which will be created in the folder from where you invoke the snakemake command. This results directory contains different subdirectories related to the different steps in the assembly: - results/pre_assembly: genomescope and smudgeplot output (each in its own subfolder) - results/assembly: Hifiasm assembly output and corresponding busco results - results/scaffolding: scaffolding output, separated in two folders: - meryl: meryl databases used for filtering HiC reads - yahs: scaffolding output, including final scaffolds and their corresponding busco results - results/decontamination: decontamination output of the final scaffolded assembly - results/organelles: assembled organellar genomes Additionally, a text file containing all software versions will be created in the specified input directory. 
The log files of the different steps in the workflow can be found in the ```logs``` directory that will be created.""" ; ns1:image ; ns1:keywords "" ; ns1:license ; ns1:name "EBP-Nor Genome Assembly Pipeline" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "COMPSs" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-14T14:05:16Z"^^ns1:Date ; ns1:dateModified "2024-08-09T07:14:45Z"^^ns1:Date ; ns1:description """**Name:** Random Forest **Contact Person**: support-compss@bsc.es **Access Level**: public **License Agreement**: Apache2 **Platform**: COMPSs **Machine**: MareNostrum4 This is an example of the Random Forest algorithm from dislib. To show the usage, the code generates a synthetic input matrix. The results are printed by screen. This application used [dislib-0.9.0](https://github.com/bsc-wdc/dislib/tree/release-0.9) """ ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.748.1" ; ns1:image ; ns1:keywords "Machine Learning, PyCOMPSs, Python, COMPSs, dislib, non_data_persistence, Supercomputer, Marenostrum IV" ; ns1:license ; ns1:name "Random Forest Classifier executed in 3 nodes, 1 master and 2 workers, with a generated dataset, using 1 Million rows x 100 features" ; ns1:producer , ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#Amel_4.5_scaffolds.fa.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#amel_OGSv3.2.gff3.gz" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#forager.bw" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#forager_Amel4.5_accepted_hits.bam" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-14T15:59:09Z"^^ns1:Date ; ns1:dateModified "2025-11-04T09:59:27Z"^^ns1:Date ; ns1:description "Refining Genome Annotations with Apollo" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.749.1" ; ns1:input , , , ; ns1:isPartOf ; ns1:keywords "genome-annotation" ; ns1:license ; ns1:name "Refining Genome Annotations with Apollo (prokaryotes)" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STAR parameter" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "turtle file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "maximum memory usage in megabytes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Run STAR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STAR" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bowtie2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genome fasta" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Transcripts" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Input sequence reads" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Bandage Image: Assembly Graph Image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly (Graphical Fragment Assembly)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly (assembly_graph)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly (assembly_info)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly (consensus)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Flye assembly statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Quast: HTML report" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2026-02-20T03:01:51Z"^^ns1:Date ; ns1:dateModified "2026-02-20T03:01:51Z"^^ns1:Date ; ns1:description "Assemble long reads with Flye, then view assembly statistics and assembly graph" ; ns1:input ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "assembly-with-flye/main" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 4 . a ns1:Person ; ns1:name "Galaxy" . a ns1:Person ; ns1:name "VGP" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly to leave alone (For Merqury comparison)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly to purge" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Database for Busco Lineage" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Estimated genome size - Parameter File" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Genomescope model parameters" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Haplotype" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Meryl Database" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of purged assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Name of un-altered assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Pacbio Reads Collection - Trimmed" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species Name" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Assembly statistics for both assemblies" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged Assembly: Full Table" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged Assembly: Full Table Busco" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged Assembly: Miniprot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged Assembly: Summary" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Compleasm on purged Assembly: Translated Proteins" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Cutoffs" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Haplotype for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Lineage for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury on Phased assemblies: Images" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Merqury on Phased assemblies: stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Nx Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged assembly (GFA)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Purged assembly statistics" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Read Coverage and cutoffs calculation: Histogram plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Removed haplotigs" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Size Plot" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Species for report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "clean_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_QV" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "merqury_stats" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_01.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.assembly_02.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-asm.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "output_merqury.spectra-cn.fl" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "qv_files" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2026-01-29T03:01:47Z"^^ns1:Date ; ns1:dateModified "2026-01-29T03:01:48Z"^^ns1:Date ; ns1:description "Purge contigs marked as duplicates by purge_dups in a single haplotype (could be haplotypic duplication or overlap duplication). If you think the purged contigs might belong to the other haplotype, use the workflow VGP6 instead. This workflow is the 6th workflow of the VGP pipeline. It is meant to be run after one of the contigging steps (Workflow 3, 4, or 5)." ; ns1:input , , , , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Purge-duplicates-one-haplotype-VGP6b/main" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 19 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Raw genome sequence" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker masked sequence on input dataset(s)" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker output log on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker repeat catalogue on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RepeatMasker repeat statistics on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:38:35Z"^^ns1:Date ; ns1:dateModified "2025-11-04T09:55:52Z"^^ns1:Date ; ns1:description "Masking repeats in a genome using RepeatMasker" ; ns1:input ; ns1:isPartOf ; ns1:keywords "genome-annotation" ; ns1:license ; ns1:name "Masking repeats with RepeatMasker" ; ns1:output , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Alternate annotation gbk" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Alternate annotation gff3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Genome assembly" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/NCBI submission template" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Protein evidence sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNASeq reads forward" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/RNASeq reads reverse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/AEGeAN report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Busco image" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Busco summary" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Final annotation (CDS sequences)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Final annotation (GFF3)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Final annotation (genbank)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Final annotation (mRNA sequences)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Final annotation (protein sequences)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Funannotate compare report" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/JBrowse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Mapped RNASeq" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:39:50Z"^^ns1:Date ; ns1:dateModified "2025-11-06T13:19:21Z"^^ns1:Date ; ns1:description "Structural and functional genome annotation with Funannotate" ; ns1:input , , , , , , ; ns1:isPartOf ; ns1:keywords "genome-annotation" ; ns1:license ; ns1:name "Genome annotation with Funannotate" ; ns1:output , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Protein sequences" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:44:05Z"^^ns1:Date ; ns1:dateModified "2025-11-06T13:19:05Z"^^ns1:Date ; ns1:description "Functional annotation of protein sequences" ; ns1:input ; ns1:isPartOf ; ns1:keywords "genome-annotation" ; ns1:license ; ns1:name "Functional protein annotation using EggNOG-mapper and InterProScan" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:46:22Z"^^ns1:Date ; ns1:dateModified "2025-11-04T09:51:42Z"^^ns1:Date ; ns1:description "From Copernicus Sentinel 5P data to panoply visualization of volcanic activity impact to atmosphere" ; ns1:isPartOf ; ns1:keywords "Climate" ; ns1:license ; ns1:name "Sentinel 5P volcanic data visualization" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Muon Spectroscopy Computational Project" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Copper-out.cell" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Copper.castep" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Copper.den_fmt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "All muons in host material" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2024-02-15T11:52:27Z"^^ns1:Date ; ns1:dateModified "2025-11-06T13:18:35Z"^^ns1:Date ; ns1:description "Finding potential muon stopping sites in crystalline copper" ; ns1:input , , ; ns1:isPartOf ; ns1:keywords "" ; ns1:license ; ns1:name "Finding the Muon Stopping Site using PyMuonSuite" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Josse" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Occurrence.csv" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:58:49Z"^^ns1:Date ; ns1:dateModified "2025-11-06T13:18:06Z"^^ns1:Date ; ns1:description "Calculating and visualizing marine biodiversity indicators" ; ns1:input ; ns1:isPartOf , ; ns1:keywords "Ecology, Earth-system" ; ns1:license ; ns1:name "Calculating and visualizing OBIS marine biodiversity indicators" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Marie Jossé" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-15T11:59:36Z"^^ns1:Date ; ns1:dateModified "2025-11-06T13:17:35Z"^^ns1:Date ; ns1:description "NDVI data with OpenEO to time series visualisation with HoloViz" ; ns1:isPartOf ; ns1:keywords "Ecology" ; ns1:license ; ns1:name "Visualizing NDVI time-series data with HoloViz" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2020-11-23T16:22:43Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:46:15Z"^^ns1:Date ; ns1:description """Workflow to build different indices for different tools from a genome and transcriptome. This workflow expects an (annotated) genome in GBOL ttl format. Steps: - SAPP: rdf2gtf (genome fasta) - SAPP: rdf2fasta (transcripts fasta) - STAR index (Optional for Eukaryotic origin) - bowtie2 index - kallisto index """ ; ns1:image ; ns1:input , , , , ; ns1:keywords "Alignment" ; ns1:license ; ns1:name "Indices builder from GBOL RDF (TTL)" ; ns1:output , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ligand frcmod file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/ligand mol2 file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/protein-ligand complex pdb file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Build tLEaP: tleap.in" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Build tLEaP: tleap.log" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of coordinate files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of mol2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of mol3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of pdb" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of topology files" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/Collection of txt" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GROMACS calculation of RMSD on input dataset(s)" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/GROMACS calculation of RMSF on input dataset(s)" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_1" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_10" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_11" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_12" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_13" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_14" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_15" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_16" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_17" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_18" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_19" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_2" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_20" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_21" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_22" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_23" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_24" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_25" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_26" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_27" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_28" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_29" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_3" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_30" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_31" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_32" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_33" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_34" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_35" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_36" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_37" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_38" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_39" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_4" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_40" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_41" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_42" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_43" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_44" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_45" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_5" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_6" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_7" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_8" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/_anonymous_output_9" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "#main/input dataset(s) (extracted element)" . 
a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator ; ns1:dateCreated "2024-02-16T17:23:53Z"^^ns1:Date ; ns1:dateModified "2024-03-02T16:44:04Z"^^ns1:Date ; ns1:description """This is a Galaxy workflow for performing molecular dynamics simulations and analysis with flavivirus helicases bound to a ligand/drug molecule. The associated input files can be found at: https://zenodo.org/records/7493015 The associated output files can be found at: https://zenodo.org/records/7850935""" ; ns1:identifier "https://doi.org/10.48546/workflowhub.workflow.761.1" ; ns1:input , , ; ns1:keywords "helicase, rna virus, zika, dengue, west nile, NS3, molecular dynamics" ; ns1:license ; ns1:name "flavivirushelicase_proteindrugcomplex" ; ns1:output , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bowtie2 index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filer rRNA" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "GTF file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto index" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Max memory" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filename prefix" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "bowtie2 output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FASTQC" . 
a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FeatureCounts output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered reads folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2020-11-24T11:05:56Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:46:21Z"^^ns1:Date ; ns1:description """Workflow for NonSpliced RNAseq data with multiple aligners. Steps: - workflow_quality.cwl: - FastQC (control) - fastp (trimming) - bowtie2 (read mapping) - sam_to_sorted-bam - featurecounts (transcript read counts) - kallisto (transcript [pseudo]counts) """ ; ns1:image ; ns1:input , , , , , , , , ; ns1:keywords "Alignment, bowtie2, featurecounts, kallisto, nonspliced" ; ns1:license ; ns1:name "NonSpliced RNAseq workflow" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T12:52:22Z"^^ns1:Date ; ns1:dateModified "2023-05-30T12:07:57Z"^^ns1:Date ; ns1:description "Analysis of variation within individual COVID-19 samples using Illumina Paired End data. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - PE Variation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . 
a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset Collection" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "NC_045512" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T12:54:10Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:12Z"^^ns1:Date ; ns1:description "Analysis of variation within individual COVID-19 samples using Illumina Single End data. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input , ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - SE Variation" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:Person ; ns1:name "Andrew Lonie" . a ns1:Person ; ns1:name "Anton Nekrutenko" . a ns1:Person ; ns1:name "Bert Droesbeke" . a ns1:Person ; ns1:name "Björn Grüning" . a ns1:Person ; ns1:name "Dannon Baker" . a ns1:Person ; ns1:name "Dave Bouvier" . a ns1:Person ; ns1:name "Delphine Larivière" . a ns1:Person ; ns1:name "Frederik Coppens" . a ns1:Person ; ns1:name "Gildas Le Corguillé" . a ns1:Person ; ns1:name "Ignacio Eguinoa" . a ns1:Person ; ns1:name "James Taylor" . a ns1:Person ; ns1:name "John Chilton" . a ns1:Person ; ns1:name "Marius van den Beek" . a ns1:Person ; ns1:name "Nate Coraor" . a ns1:Person ; ns1:name "Nicholas Keener" . a ns1:Person ; ns1:name "Sergei Kosakovsky Pond" . a ns1:Person ; ns1:name "Simon Gladman" . 
a ns1:Person ; ns1:name "Steven Weaver" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "0_Input Dataset" . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "folder where the STAR indices are" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "filter_rrna" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "forward reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "gtf" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "folder where the kallisto indices are" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "maximum memory usage in megabytes" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "prefix_id" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "quantMode" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "number of threads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "STAR output folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FASTQC" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "FeatureCounts output" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Filtered reads folder" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "kallisto output" . 
a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , ; ns1:dateCreated "2020-12-22T15:53:49Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:46:30Z"^^ns1:Date ; ns1:description """Workflow for Spliced RNAseq data **Steps:** * workflow_quality.cwl: * FastQC (Read Quality Control) * fastp (Read Trimming) * STAR (Read mapping) * featurecounts (transcript read counts) * kallisto (transcript [pseudo]counts) """ ; ns1:image ; ns1:input , , , , , , , , , ; ns1:keywords "RNASEQ, rna, rna-seq, kallisto, STAR" ; ns1:license ; ns1:name "Spliced RNAseq workflow" ; ns1:output , , , , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputerLanguage ; ns1:alternateName "CWL" ; ns1:identifier ; ns1:name "Common Workflow Language" ; ns1:url . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step11_grompp_npt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step13_grompp_md_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step14_mdrun_md_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step1_pdb_file" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step2_editconf_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step4_grompp_genion_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step5_genion_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step6_grompp_min_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step8_make_ndx_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "step9_grompp_nvt_config" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "whole workflow output" . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-01-29T16:56:33Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:46:36Z"^^ns1:Date ; ns1:description """CWL version of the md_list.cwl workflow for HPC. 
""" ; ns1:image ; ns1:input , , , , , , , , , ; ns1:isBasedOn ; ns1:keywords "" ; ns1:license ; ns1:name "Example of setting up a simulation system" ; ns1:output ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 2 . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Forwards reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "Reverse reads" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_3" . a ns1:FormalParameter ; dcterms:conformsTo ; ns1:name "_anonymous_output_4" . a ns1:ComputerLanguage ; ns1:identifier ; ns1:name "Galaxy" ; ns1:url . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:dateCreated "2021-02-02T19:06:59Z"^^ns1:Date ; ns1:dateModified "2023-02-13T14:06:45Z"^^ns1:Date ; ns1:description """Galaxy version of pre-processing of reads from COVID-19 samples. QC + human read cleaning Based on https://github.com/Finn-Lab/Metagen-FastQC/blob/master/metagen-fastqc.sh""" ; ns1:input , ; ns1:keywords "" ; ns1:license ; ns1:name "COVID-19: read pre-processing" ; ns1:output , ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . a ns1:ComputationalWorkflow, ns1:SoftwareSourceCode ; dcterms:conformsTo ; ns1:creator , , , , , , , , , , , , , , , , , , ; ns1:dateCreated "2020-04-10T13:00:36Z"^^ns1:Date ; ns1:dateModified "2023-01-16T13:40:19Z"^^ns1:Date ; ns1:description "Analysis of S-protein polymorphism. This workflow includes: obtaining coding sequences of S proteins from a diverse group of coronaviruses and generating amino acid alignments to assess conservation of the polymorphic location. More info can be found at https://covid19.galaxyproject.org/genomics/" ; ns1:image ; ns1:input ; ns1:keywords "covid-19" ; ns1:license ; ns1:name "Genomics - Analysis of S-protein polymorphism" ; ns1:producer ; ns1:programmingLanguage ; ns1:sdPublisher ; ns1:url ; ns1:version 1 . 
<#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing)> a ns1:Person ; ns1:name "The Workflows and Distributed Computing Team (https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing)" . a ns1:Person ; ns1:name "Job van Riet" . a ns1:Person ; ns1:name "Fabrice Touzain" . a ns1:Person ; ns1:name "Adam Tofilski" . a ns1:Person ; ns1:name "Daan Hazelaar", "Daan Hazelaar" . a ns1:Person ; ns1:name "Anna Syme" . a ns1:Collection ; ns1:name "TronFlow" . a ns1:Person ; ns1:name "Valentine Murigneux" . a ns1:Person ; ns1:name "Alexandre Cormier" . a ns1:Person ; ns1:name "Cyril Noel" . a ns1:Person ; ns1:name "Pierre Cuzin" . a ns1:Person ; ns1:name "Mike Thang" . a ns1:Person ; ns1:name "Philipp Gormanns" . a ns1:Person ; ns1:name "Ivan Topolsky" . a ns1:Person ; ns1:name "Kary Ocaña" . a ns1:Person ; ns1:name "Luca Pireddu" . a ns1:Person ; ns1:name "Varsha Kale" . a ns1:Person ; ns1:name "Cristiane Taniguti" . a ns1:Person ; ns1:name "Diego Garrido-Martín" . a ns1:Person ; ns1:name "Roderic Guigó" . a ns1:Person ; ns1:name "Stevie Pederson" . a ns1:Person ; ns1:name "Andrea Furlani" . a ns1:Person ; ns1:name "Anne Fouilloux" . a ns1:Person ; ns1:name "Changlin Ke" . a ns1:Person ; ns1:name "Ziad Al-Bkhetan" . a ns1:Person ; ns1:name "Michael R. Crusoe" . a ns1:Person ; ns1:name "Pavankumar Videm" . a ns1:Organization, ns1:Project ; ns1:name "Air Quality Prediction" . a ns1:Organization, ns1:Project ; ns1:name "Read2Map" . a ns1:Organization, ns1:Project ; ns1:name "ParslRNA-Seq: an efficient and scalable RNAseq analysis workflow for studies of differentiated gene expression" . a ns1:Organization, ns1:Project ; ns1:name "Apis-wings" . a ns1:Organization, ns1:Project ; ns1:name "Black Ochre Data Labs" . a ns1:Organization, ns1:Project ; ns1:name "nf-core" . 
a ns1:Organization, ns1:Project ; ns1:name "Workflows for geographic science" . a ns1:Organization, ns1:Project ; ns1:name "Katdetectr" . a ns1:Organization, ns1:Project ; ns1:name "ANSES-Ploufragan" . a ns1:Organization, ns1:Project ; ns1:name "ERGA Annotation" . a ns1:Organization, ns1:Project ; ns1:name "CholGen" . a ns1:Organization, ns1:Project ; ns1:name "BY-COVID (general)" . a ns1:Organization, ns1:Project ; ns1:name "EOSC-Life - Demonstrator 7: Rare Diseases" . a ns1:Organization, ns1:Project ; ns1:name "Pawsey Supercomputing Research Centre" . a ns1:Organization, ns1:Project ; ns1:name "SeBiMER" . a ns1:Organization, ns1:Project ; ns1:name "nf-core viralrecon" . a ns1:Organization, ns1:Project ; ns1:name "Snakemake-Workflows" . a ns1:Organization, ns1:Project ; ns1:name "Applied Computational Biology at IEG/HMGU" . a ns1:Organization, ns1:Project ; ns1:name "INFRAFRONTIER workflows" . a ns1:Organization, ns1:Project ; ns1:name "V-Pipe" . a ns1:Organization, ns1:Project ; ns1:name "TransBioNet" . a ns1:Organization, ns1:Project ; ns1:name "CRC Cohort" . a ns1:Organization, ns1:Project ; ns1:name "Workflows Australia" . a ns1:Organization, ns1:Project ; ns1:name "usegalaxy.be workflows" . a ns1:Person ; ns1:name "Javier Conejero" . a ns1:Person ; ns1:name "Simon Bray" . a ns1:Person ; ns1:name "Anthony Bretaudeau" . a ns1:Collection ; ns1:name "CEITEC CryoEM Facility Workflows" . a ns1:Collection ; ns1:name "Vertebrate Genomes Pipelines (VGP) workflows" . a ns1:Person ; ns1:name "Juma Bayjan" . a ns1:Person ; ns1:name "Friederike Ehrhart" . a ns1:Person ; ns1:name "Ozan Ozisik" . a ns1:Person ; ns1:name "Patrick Durand" . a ns1:Person ; ns1:name "Laura Leroi" . a ns1:Person ; ns1:name "Laurence Livermore" . a ns1:Person ; ns1:name "Germán Royval" . a ns1:Person ; ns1:name "Cristóbal Gallardo" . a ns1:Person ; ns1:name "Cenna Doornbos" . a ns1:Person ; ns1:name "Ambarish Kumar" . a ns1:Person ; ns1:name "Jeanette Reinshagen" . 
a ns1:Person ; ns1:name "Ekaterina Sakharova" . a ns1:Person ; ns1:name "Konstantinos Kyritsis" . a ns1:Organization, ns1:Project ; ns1:name "TRON gGmbH" . a ns1:Organization, ns1:Project ; ns1:name "Euro-BioImaging" . a ns1:Organization, ns1:Project ; ns1:name "SANBI Pathogen Bioinformatics" . a ns1:Organization, ns1:Project ; ns1:name "eFlows4HPC general" . a ns1:Organization, ns1:Project ; ns1:name "Scipion CNB" . a ns1:Organization, ns1:Project ; ns1:name "Biodiversity Genomics Europe (general)" . a ns1:Organization, ns1:Project ; ns1:name "SARS-CoV-2 Data Hubs" . a ns1:Organization, ns1:Project ; ns1:name "EOSC4Cancer" . a ns1:Organization, ns1:Project ; ns1:name "CO2MICS Lab" . a ns1:Organization, ns1:Project ; ns1:name "EU-Openscreen" . a ns1:Person ; ns1:name "Fotis Psomopoulos" . a ns1:Person ; ns1:name "Matthieu Muffato" . a ns1:Person ; ns1:name "Nikolaos Pechlivanis" . a ns1:Collection ; ns1:name "ERGA Assembly Galaxy Long Reads & Hi-C Pipelines (Hifiasm-solo + Purge_Dups + YaHS)" . a ns1:Collection ; ns1:name "HiFi genome assembly on Galaxy" . a ns1:Person ; ns1:name "Gareth Price" . a ns1:Person ; ns1:name "Oliver Woolland" . a ns1:Person ; ns1:name "Melchior du Lac" . a ns1:Person ; ns1:name "Jorge Ejarque" . a ns1:Organization, ns1:Project ; ns1:name "Seq4AMR" . a ns1:Organization, ns1:Project ; ns1:name "NanoGalaxy" . a ns1:Organization, ns1:Project ; ns1:name "Biodata Analysis Group" . a ns1:Organization, ns1:Project ; ns1:name "Tree of Life Genome Analysis" . a ns1:Organization, ns1:Project ; ns1:name "EJPRD WP13 case-studies workflows" . a ns1:Person ; ns1:name " Priyanka Surana", "Priyanka Surana" . a ns1:Person ; ns1:name "Adam Hospital" . a ns1:Person ; ns1:name "Petr Walczysko" . a ns1:Collection ; ns1:name "ERGA Assembly Galaxy ONT+Illumina & HiC Pipelines (NextDenovo-HyPo + Purge_Dups + YaHS)" . a ns1:Collection ; ns1:name "ERGA Assembly Galaxy ONT+Illumina & HiC Pipelines (Flye-HyPo + Purge_Dups + YaHS)" . 
a ns1:Collection ; ns1:name "BioExcel Building Blocks (BioBB) Protein MD Setup Tutorials" . a ns1:Person ; ns1:name "Coline Royaux" . a ns1:Person ; ns1:name "Sergi Sayols" . a ns1:Organization, ns1:Project ; ns1:name "Cluster Emergent del Cervell Humà" . a ns1:Organization, ns1:Project ; ns1:name "HoloFood at MGnify" . a ns1:Organization, ns1:Project ; ns1:name "IMBforge" . a ns1:Organization, ns1:Project ; ns1:name "UX trial team" . a ns1:Organization, ns1:Project ; ns1:name "iPC: individualizedPaediatricCure" . a ns1:Organization, ns1:Project ; ns1:name "Specimen Data Refinery" . a ns1:Collection ; ns1:name "ERGA Assembly Galaxy Long Reads & Hi-C Pipelines (Hifiasm-HiC + Purge_Dups + YaHS)" . a ns1:Collection ; ns1:name "16S Microbial Analysis with mothur (on Galaxy Australia)" . a ns1:Collection ; ns1:name "Freely accessible ready to use global infrastructure and workflows for SARS-CoV-2 monitoring" . a ns1:Collection ; ns1:name "IDR" . a ns1:Person ; ns1:name "José Mª Fernández" . a ns1:Person ; ns1:name "Douglas Lowe" . a ns1:Person ; ns1:name "Javier Conejero" . a ns1:Organization, ns1:Project ; ns1:name "Integrated and Urban Plant Pathology Laboratory" . a ns1:Organization, ns1:Project ; ns1:name "CWL workflow SARS-CoV-2" . a ns1:Person ; ns1:name "Yasmmin Martins" . a ns1:Person ; ns1:name "Delphine Lariviere" . a ns1:Person ; ns1:name "Cali Willet" . a ns1:Person ; ns1:name "Martin Beracochea" . a ns1:Person ; ns1:name "Saskia Hiltemann" . a ns1:Organization, ns1:Project ; ns1:name "IBISBA Workflows" . a ns1:Organization, ns1:Project ; ns1:name "yPublish - Bioinfo tools" . a ns1:Collection ; ns1:name "COMPSs Workflows from 2024 Tutorial" . a ns1:Collection ; ns1:name "scRNAseq processing in galaxy" . a ns1:Person ; ns1:name "Rosemarie Sadsad" . a ns1:Person ; ns1:name "Jean-Marie Burel" . a ns1:Person ; ns1:name "Wolfgang Maier" . a ns1:Organization, ns1:Project ; ns1:name "PerMedCoE" . a ns1:Organization, ns1:Project ; ns1:name "Galaxy Climate" . 
a ns1:Organization, ns1:Project ; ns1:name "COMPSs Tutorials" . a ns1:Organization, ns1:Project ; ns1:name "OME" . a ns1:Organization, ns1:Project ; ns1:name "MGnify" . <#The%20Workflows%20and%20Distributed%20Computing%20Team%20(https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)> a ns1:Person ; ns1:name "The Workflows and Distributed Computing Team (https://www.bsc.es/discover-bsc/organisation/scientific-structure/workflows-and-distributed-computing/)" . a ns1:Person ; ns1:name "Rosa M Badia", "Rosa M. Badia" . a ns1:Person ; ns1:name "Georgina Samaha" . a ns1:Person ; ns1:name "Laura Rodriguez-Navas" . a ns1:Organization, ns1:Project ; ns1:name "Galaxy Training Network" . a ns1:Organization, ns1:Project ; ns1:name "EuroScienceGateway" . a ns1:Collection ; ns1:name "Workflows in EuroScienceGateway" . a ns1:Collection ; ns1:name "BY-COVID related workflows" . a ns1:Person ; ns1:name "Diego De Panis" . a ns1:Person ; ns1:name "Yvan Le Bras" . a ns1:Person ; ns1:name "Lucille Delisle" . a ns1:Collection ; ns1:name "Tutorials for BioExcel Building Blocks (BioBB)" . a ns1:Organization, ns1:Project ; ns1:name "PNDB" . a ns1:Collection ; ns1:name "Biodiversity & ecology workflows" . a ns1:Person ; ns1:name "Jasper Koehorst" . a ns1:Organization, ns1:Project ; ns1:name "ERGA Assembly" . a ns1:Person ; ns1:name "Tracy Chew" . a ns1:Person ; ns1:name "Bart Nijsse" . a ns1:Organization, ns1:Project ; ns1:name "UNLOCK" . a ns1:Organization, ns1:Project ; ns1:name "usegalaxy-eu" . a ns1:Collection ; ns1:name "Interactive Jupyter Notebooks for FAIR and reproducible biomolecular simulation workflows" . a ns1:Organization, ns1:Project ; ns1:name "Sydney Informatics Hub" . a ns1:Organization, ns1:Project ; ns1:name "Workflows and Distributed Computing" . a ns1:Person ; ns1:name "Anna Syme" . a ns1:Organization, ns1:Project ; ns1:name "QCIF Bioinformatics" . a ns1:Organization, ns1:Project ; ns1:name "GalaxyProject SARS-CoV-2" . 
a ns1:Organization, ns1:Project ; ns1:name "Galaxy Australia" . a ns1:Organization, ns1:Project ; ns1:name "Australian BioCommons" . a ns1:Collection ; ns1:name "BioCommons ‘Bring Your Own Data’ Expansion Project" . a ns1:Organization, ns1:Project ; ns1:name "Intergalactic Workflow Commission (IWC)" . a ns1:Person ; ns1:name "Genís Bayarri" . a ns1:Person ; ns1:name "Adam Hospital" . a ns1:Organization, ns1:Project ; ns1:name "BioBB Building Blocks" . a ns1:Organization ; ns1:name "WorkflowHub" ; ns1:url .