syntax = "proto3";

import "google/protobuf/struct.proto";

package vg;

// A Graph is a collection of nodes and edges. It may represent a subgraph of
// a larger graph or be wholly self-sufficient.
//
// Because of the Protobuf memory limit of 67108864 bytes, individual Graph
// messages are typically kept small, and graphs are generated as collections
// of smaller subgraphs.
message Graph {
  // The `Node`s that make up the graph.
  repeated Node node = 1;

  // The `Edge`s that connect the `Node`s in the graph.
  repeated Edge edge = 2;

  // A set of named `Path`s that visit sequences of oriented `Node`s.
  repeated Path path = 3;
}

// A Node stores sequence data.
message Node {
  // Sequence of DNA bases represented by the Node.
  string sequence = 1;

  // A name that provides an identifier.
  string name = 2;

  // Unique, positive, nonzero ID of the Node within its Graph.
  int64 id = 3;
}

// An Edge describes a linkage between nodes. Edges are bidirected: they
// connect the end (default) or start of the "from" node to the start
// (default) or end of the "to" node.
message Edge {
  // ID of the upstream node.
  int64 from = 1;

  // ID of the downstream node.
  int64 to = 2;

  // Set if the edge leaves from the 5' (start) of a node.
  bool from_start = 3;

  // Set if the edge goes to the 3' (end) of a node.
  bool to_end = 4;

  // Length of the overlap between the connected `Node`s.
  int32 overlap = 5;
}

// Edits describe how to generate a new string from elements
// in the graph. To determine the new string, just walk the series of edits,
// stepping from_length distance in the basis node, and to_length in the
// novel element, replacing from_length in the basis node with the sequence.
//
// There are several types of Edit:
// - *matches*: from_length == to_length; sequence is empty
// - *snps*: from_length == to_length; sequence = alt
// - *deletions*: to_length == 0 && from_length > to_length; sequence is empty
// - *insertions*: from_length < to_length; sequence = alt
//
message Edit {
  // Length in the target/ref sequence that is removed.
  int32 from_length = 1;

  // Length in the read/alt of the sequence it is replaced with.
  int32 to_length = 2;

  // The replacement sequence, if different from the original sequence.
  string sequence = 3;
}

// A Mapping defines the relationship between a node in the system and another
// entity.
//
// An empty edit list implies a complete match; however, specifying the full
// edit structure is preferred, as it is more complex to handle special cases.
message Mapping {
  // The position at which the first Edit, if any, in the Mapping starts.
  // Inclusive.
  Position position = 1;

  // The series of `Edit`s to transform to the region in the read/alt.
  repeated Edit edit = 2;

  // The 1-based rank of the mapping in its containing path.
  int64 rank = 5;
}

// A position in the graph is a node, direction, and offset.
//
// The node is stored by ID. The offset is 0-based and counts from the start
// of the node in the specified orientation. The direction specifies which
// orientation of the node we are considering: the forward (as stored) or the
// reverse complement.
//
// Example:
//
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
//     seq-        C T A A T G T
//     offset-  → 0 1 2 3 4 5 6 7
//
// Or both at once:
//
//     offset-    7 6 5 4 3 2 1 0 ←
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
// A Position can also, with the `name` and `offset` fields, be used to
// represent a distance along a named `Path`.
// TODO: Is this an appropriate hack? Or should we add a new message?
message Position {
  // The Node on which the Position is.
  int64 node_id = 1;

  // The offset into that node's sequence at which the Position occurs.
  int64 offset = 2;

  // True if we obtain the original sequence of the path by reverse
  // complementing the mappings.
  bool is_reverse = 4;

  // Used when the Position represents a position against a reference path.
  string name = 5;
}

// Paths are walks through nodes defined by a series of `Edit`s.
// They can be used to represent:
//   - haplotypes
//   - mappings of reads, or alignments, by including edits
//   - relationships between nodes
//   - annotations from other data sources, such as:
//     genes, exons, motifs, transcripts, peaks
message Path {
  // The name of the path. Path names starting with underscore (_) are
  // reserved for internal VG use.
  string name = 1;

  // The `Mapping`s which describe the order and orientation in which the
  // Path visits `Node`s.
  repeated Mapping mapping = 2;

  // Set to true if the path is circular.
  bool is_circular = 3;

  // Optional length annotation for the Path.
  int64 length = 4;
}

// Alignments link query strings, such as other genomes or reads, to Paths.
message Alignment {
  // The sequence that has been aligned.
  string sequence = 1;

  // The Path that the sequence follows in the graph it has been aligned to,
  // containing the `Edit`s that modify the graph to produce the sequence.
  Path path = 2;

  // The name of the sequence that has been aligned. Similar to read name in
  // BAM.
  string name = 3;

  // The quality scores for the sequence, as values on a 0-255 scale.
  bytes quality = 4;

  // The mapping quality score for the alignment, in Phreds.
  int32 mapping_quality = 5;

  // The score for the alignment, in points.
  int32 score = 6;

  // The offset in the query at which this Alignment occurs.
  int32 query_position = 7;

  // Old field 8 has been removed.
  reserved 8;

  // The name of the sample that produced the aligned read.
  string sample_name = 9;

  // The name of the read group to which the aligned read belongs.
  string read_group = 10;

  // The previous Alignment in the fragment. Contains just enough information
  // to locate the full Alignment; e.g. contains an Alignment with only a
  // name, or only a graph mapping position.
  Alignment fragment_prev = 11;

  // Similarly, the next Alignment in the fragment.
  Alignment fragment_next = 12;

  // Flag marking the Alignment as secondary. All but one maximal-scoring
  // alignment of a given read in a GAM file must be secondary.
  bool is_secondary = 15;

  // Portion of aligned bases that are perfect matches, or 0 if no bases are
  // aligned.
  double identity = 16;

  // An estimate of the length of the fragment, if this Alignment is paired.
  repeated Path fragment = 17;

  // The loci that this alignment supports.
  // TODO: get rid of this, we have annotations in our data model again.
  repeated Locus locus = 18;

  // Position of the alignment in reference paths embedded in the graph. Each
  // position has a path name, and the Alignment's minimum position along the
  // path as an offset.
  repeated Position refpos = 19;

  // SAMTools-style flags.
  bool read_paired = 20;
  bool read_mapped = 21;
  bool mate_unmapped = 22;
  bool read_on_reverse_strand = 23;
  bool mate_on_reverse_strand = 24;
  bool soft_clipped = 25;
  bool discordant_insert_size = 26;

  // The fraction of bases in the alignment that are covered by MEMs with at
  // most 1 total hit in the graph.
  double uniqueness = 27;

  // Correctness metric: 1 = perfectly aligned to truth, 0 = not overlapping
  // the true alignment.
  double correct = 28;

  // The ordered list of scores of secondary mappings.
  repeated int32 secondary_score = 29;

  // Score under the given fragment model; assume higher is better.
  double fragment_score = 30;

  bool mate_mapped_to_disjoint_subgraph = 31;

  // The fragment length distribution under which a paired-end alignment was
  // aligned.
  string fragment_length_distribution = 32;

  // Haplotype-scoring-related fields migrated to the annotation system.
  reserved 33 to 34;

  // The time this alignment took.
  double time_used = 35;

  // A path/offset/orientation pair specifying the distance to the correct
  // alignment.
  Position to_correct = 36;

  // Can be set to true to annotate the Alignment as having been mapped
  // correctly.
  bool correctly_mapped = 37;

  // Annotations carried along with the Alignment.
  google.protobuf.Struct annotation = 100;
}

// A subgraph of the unrolled Graph in which each non-branching path is
// associated with an alignment of part of the read and part of the graph,
// such that any path through the MultipathAlignment indicates a valid
// alignment of a read to the graph.
message MultipathAlignment {
  // NOTE(review): the next five fields share their names with fields of
  // Alignment; presumably they carry the same meaning -- confirm.
  string sequence = 1;
  bytes quality = 2;
  string name = 3;
  string sample_name = 4;
  string read_group = 5;

  // Non-branching paths of the multipath alignment, each containing an
  // alignment of part of the sequence to a Graph.
  // IMPORTANT: downstream applications will assume these are stored in
  // topological order.
  repeated Subpath subpath = 6;

  // -10 * log_10(probability of mismapping)
  int32 mapping_quality = 7;

  // Optional: indices of Subpaths that align the beginning of the read
  // (i.e. source nodes).
  repeated uint32 start = 8;

  string paired_read_name = 9;

  // Annotations carried along with the Alignment.
  google.protobuf.Struct annotation = 100;
}

// A non-branching path of a MultipathAlignment.
message Subpath {
  // Describes the node sequence and the edits to the graph sequences.
  Path path = 1;

  // The indices of subpaths in the multipath alignment that are to the right
  // of this path, where right is in the direction of the end of the read
  // sequence.
  repeated uint32 next = 2;

  // Score of this subpath's alignment.
  int32 score = 3;

  // Connections to other subpaths that are not necessarily contiguous in the
  // graph.
  repeated Connection connection = 4;
}

// An edge in a MultipathAlignment between Subpaths that may not be contiguous
// in the graph.
message Connection {
  // The index of the Subpath that this connection points to.
  uint32 next = 1;

  // The score of this connection.
  int32 score = 2;
}

// Used to serialize kmer matches.
message KmerMatch {
  string sequence = 1;
  int64 node_id = 2;
  sint32 position = 3;

  // If true, this kmer is backwards relative to its node, and position counts
  // from the end of the node.
  bool backward = 4;
}

// Summarizes reads that map to a single position in the graph.
// This structure is pretty much identical to a line in Samtools pileup
// format. If qualities is set, it must have size = num_bases.
message BasePileup {
  int32 ref_base = 1;
  int32 num_bases = 2;
  string bases = 3;
  bytes qualities = 4;
}

// Collect pileup records by node. Saves some space and hashing over
// storing individually, assuming not too sparse and avg. node length more
// than a couple of bases.
// The ith BasePileup in the array corresponds to the position at offset i.
message NodePileup {
  int64 node_id = 1;

  // Per-base records; the ith entry corresponds to the position at offset i.
  repeated BasePileup base_pileup = 2;
}

// Keeps a pileup-like record for reads that span edges.
message EdgePileup {
  Edge edge = 1;

  // Total reads mapped.
  int32 num_reads = 2;

  // Number of reads mapped on the forward strand.
  int32 num_forward_reads = 3;

  bytes qualities = 4;
}

// Bundles up Node and Edge pileups.
message Pileup {
  repeated NodePileup node_pileups = 1;
  repeated EdgePileup edge_pileups = 2;
}

// Enumeration of the classifications of snarls.
enum SnarlType {
  UNCLASSIFIED = 0;
  ULTRABUBBLE = 1;
  UNARY = 2;
}

// Describes a subgraph that is connected to the rest of the graph by two
// nodes.
message Snarl {
  // What type of snarl is this?
  SnarlType type = 1;

  // Visits that connect the Snarl to the rest of the graph.
  // `start` points *INTO* the snarl.
  Visit start = 2;
  // `end` points *OUT OF* the snarl.
  Visit end = 3;

  // If this Snarl is nested in another, this field should be filled in with
  // a Snarl that has the start and end visits filled in (other information
  // is optional/extraneous).
  Snarl parent = 4;

  // Allows snarls to be named, e.g. by the hash of the VCF variant they come
  // from.
  string name = 5;

  // Indicate whether there is a reversing path contained in the Snarl from
  // either the start to itself or the end to itself.
  bool start_self_reachable = 6;
  bool end_self_reachable = 7;

  // Indicates whether the start of the Snarl is connected through to the end.
  bool start_end_reachable = 8;

  // Indicates whether the snarl's net graph is free of directed cycles.
  bool directed_acyclic_net_graph = 9;
}

// Describes a step of a walk through a Snarl, either on a node or through a
// child Snarl.
message Visit {
  // The node ID or snarl of this step (only one should be given).
  int64 node_id = 1;
  // Only needs to contain the start and end Visits.
  Snarl snarl = 2;

  // The segment name of this step.
  string name = 4;

  // Indicates:
  //   - if node_id is specified: reverse complement of the node
  //   - if snarl is specified: traversal of a child snarl entering backwards
  //     through end and leaving backwards through start
  bool backward = 3;
}

// Describes a walk through a Snarl where each step is given as either a node
// or a child Snarl (leaving the walk through the child Snarl to another
// SnarlTraversal).
message SnarlTraversal {
  // Steps of the walk through a Snarl, including the start and end nodes. If
  // the traversal includes a Visit that represents a Snarl, both the node
  // entering the Snarl and the node leaving the Snarl should be included in
  // the traversal.
  repeated Visit visit = 1;

  // The name of the traversal can be used for a variant allele id (e.g. _0,
  // _1...) or for some other arbitrary annotation, unique or non-unique
  // (e.g. deleterious, gain_of_function, etc.), though these will be lost in
  // any indices.
  string name = 2;
}

// Describes a genetic locus with multiple possible alleles, a genotype, and
// observational support.
message Locus {
  // A locus may have an identifying name.
  string name = 1;

  // All the alleles at the locus, not just the called ones.
  // Note that a primary reference allele may or may not appear.
  repeated Path allele = 2;

  // Per-allele supports, matching the alleles above.
  repeated Support support = 3;

  // Sorted by likelihood or posterior; the first one is the "call".
  repeated Genotype genotype = 4;

  // A Support for the locus overall, because reads may have supported
  // multiple alleles and we want to know how many total there were.
  Support overall_support = 5;

  // The likelihood of each allele individually, tracked in addition to the
  // genotype likelihoods. Stores the likelihood natural logged.
  repeated double allele_log_likelihood = 6;
}

// Describes a genotype at a particular locus.
message Genotype {
  // These refer to the offsets of the alleles in the Locus object.
  repeated int32 allele = 1;

  bool is_phased = 2;

  double likelihood = 3;

  // Likelihood natural logged.
  double log_likelihood = 4;

  // Prior natural logged.
  double log_prior = 5;

  // Posterior natural logged (unnormalized).
  double log_posterior = 6;
}

// Aggregates information about the reads supporting an allele.
message Support {
  // The overall quality of all the support, as
  // -10 * log10(P(all support is wrong)).
  double quality = 1;

  // The number of supporting reads on the forward strand (which may be
  // fractional).
  double forward = 2;

  // The number of supporting reads on the reverse strand (which may be
  // fractional).
  double reverse = 3;

  // TODO: what is this?
  double left = 4;

  // TODO: what is this?
  double right = 5;
}

// Support pinned to a location, which can be either a node or an edge.
message LocationSupport {
  // The support.
  Support support = 1;

  // The location.
  oneof oneof_location {
    Edge edge = 2;
    int64 node_id = 3;
  }
}

// Translations map from one graph to another.
// A collection of these provides a covering mapping between a from and to
// graph. If each "from" path through the base graph corresponds to a "to"
// path in an updated graph, then we can use these translations to project
// positions, mappings, and paths in the new graph into the old one using the
// Translator interface.
message Translation {
  Path from = 1;
  Path to = 2;
}