syntax = "proto3";

import "google/protobuf/struct.proto";

package vg;

// A *Graph* is a collection of nodes and edges.
// It can represent a subgraph of a larger graph or be wholly
// self-sufficient.
//
// Protobuf memory limits of 67108864 bytes mean that we typically keep
// Graph messages small, generating large graphs as collections of smaller
// subgraphs.
message Graph {
    // The `Node`s that make up the graph.
    repeated Node node = 1;

    // The `Edge`s that connect the `Node`s in the graph.
    repeated Edge edge = 2;

    // A set of named `Path`s that visit sequences of oriented `Node`s.
    repeated Path path = 3;
}

// A *Node* stores sequence data.
message Node {
    // Sequence of DNA bases represented by the Node.
    string sequence = 1;

    // A name provides an identifier.
    string name = 2;

    // Each Node has a unique positive nonzero ID within its Graph.
    int64 id = 3;
}

// An *Edge* describes a linkage between nodes. Edges are bidirected: each
// one connects the end (default) or start of the "from" node to the start
// (default) or end of the "to" node.
message Edge {
    // ID of upstream node.
    int64 from = 1;

    // ID of downstream node.
    int64 to = 2;

    // If the edge leaves from the 5' (start) of a node.
    bool from_start = 3;

    // If the edge goes to the 3' (end) of a node.
    bool to_end = 4;

    // Length of overlap between the connected `Node`s.
    int32 overlap = 5;
}

// An Edit describes how to generate a new string from elements in the graph.
// To determine the new string, just walk the series of edits, stepping
// from_length distance in the basis node, and to_length in the novel element,
// replacing from_length in the basis node with the sequence.
//
// There are several types of Edit:
// - *matches*: from_length == to_length; sequence is empty
// - *snps*: from_length == to_length; sequence = alt
// - *deletions*: to_length == 0 && from_length > to_length; sequence is empty
// - *insertions*: from_length < to_length; sequence = alt
message Edit {
    // Length in the target/ref sequence that is removed.
    int32 from_length = 1;

    // Length in read/alt of the sequence it is replaced with.
    int32 to_length = 2;

    // The replacement sequence, if different from the original sequence.
    string sequence = 3;
}

// A Mapping defines the relationship between a node in the system and another
// entity.
// An empty edit list implies a complete match; however, it is preferred to
// specify the full edit structure, as it is more complex to handle special
// cases.
message Mapping {
    // Field numbers 3 and 4 are not assigned to any field of this message.
    // They are reserved so that a new field cannot pick them up by accident.
    // NOTE(review): presumably these numbers belonged to fields that were
    // removed; confirm against the schema history.
    reserved 3, 4;

    // The position at which the first Edit, if any, in the Mapping starts.
    // Inclusive.
    Position position = 1;

    // The series of `Edit`s to transform to region in read/alt.
    repeated Edit edit = 2;

    // The 1-based rank of the mapping in its containing path.
    int64 rank = 5;
}

// A position in the graph is a node, direction, and offset.
// The node is stored by ID, and the offset is 0-based and counts from the
// start of the node in the specified orientation.
// The direction specifies which orientation of the node we are considering,
// the forward (as stored) or reverse complement.
//
// Example:
//
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
//     seq-        T G T A A T C
//     offset-  → 0 1 2 3 4 5 6 7
//
// Or both at once:
//
//     offset-    7 6 5 4 3 2 1 0 ←
//     seq+        G A T T A C A
//     offset+  → 0 1 2 3 4 5 6 7
//
// A Position can also, with the `name` and `offset` fields, be used to
// represent a distance along a named `Path`.
// TODO: Is this an appropriate hack? Or should we add a new message?
message Position {
    // Field number 3 is not assigned to any field of this message. It is
    // reserved so that a new field cannot pick it up by accident.
    // NOTE(review): presumably it belonged to a field that was removed;
    // confirm against the schema history.
    reserved 3;

    // The Node on which the Position is.
    int64 node_id = 1;

    // The offset into that node's sequence at which the Position occurs.
    // 0-based, counted from the start of the node in the orientation selected
    // by `is_reverse`.
    int64 offset = 2;

    // True if the Position is on the reverse complement orientation of the
    // node, false if it is on the forward (as stored) orientation.
    bool is_reverse = 4;

    // The name of the reference path, if the Position is used to represent a
    // position against a reference path.
    string name = 5;
}

// A Path is a walk through nodes defined by a series of `Edit`s.
// Paths can be used to represent:
//    - haplotypes
//    - mappings of reads, or alignments, by including edits
//    - relationships between nodes
//    - annotations from other data sources, such as:
//          genes, exons, motifs, transcripts, peaks
message Path {
    // The name of the path. Path names starting with underscore (_) are
    // reserved for internal VG use.
    string name = 1;

    // The `Mapping`s which describe the order and orientation in which the
    // Path visits `Node`s.
    repeated Mapping mapping = 2;

    // Set to true if the path is circular.
    bool is_circular = 3;

    // Optional length annotation for the Path.
    int64 length = 4;
}

// Alignments link query strings, such as other genomes or reads, to Paths.
message Alignment {
    // Field numbers that must not be reused:
    //   8:        old field 8 has been removed.
    //   13 to 14: not assigned to any field of this message.
    //             NOTE(review): presumably retired fields; confirm against
    //             the schema history.
    //   33 to 34: haplotype-scoring-related fields migrated to the annotation
    //             system.
    reserved 8, 13 to 14, 33 to 34;

    // The sequence that has been aligned.
    string sequence = 1;

    // The Path that the sequence follows in the graph it has been aligned to,
    // containing the `Edit`s that modify the graph to produce the sequence.
    Path path = 2;

    // The name of the sequence that has been aligned. Similar to read name in
    // BAM.
    string name = 3;

    // The quality scores for the sequence, as values on a 0-255 scale.
    bytes quality = 4;

    // The mapping quality score for the alignment, in Phreds.
    int32 mapping_quality = 5;

    // The score for the alignment, in points.
    int32 score = 6;

    // The offset in the query at which this Alignment occurs.
    int32 query_position = 7;

    // The name of the sample that produced the aligned read.
    string sample_name = 9;

    // The name of the read group to which the aligned read belongs.
    string read_group = 10;

    // The previous Alignment in the fragment. Contains just enough
    // information to locate the full Alignment; e.g. contains an Alignment
    // with only a name, or only a graph mapping position.
    Alignment fragment_prev = 11;

    // Similarly, the next Alignment in the fragment.
    Alignment fragment_next = 12;

    // Flag marking the Alignment as secondary. All but one maximal-scoring
    // alignment of a given read in a GAM file must be secondary.
    bool is_secondary = 15;

    // Portion of aligned bases that are perfect matches, or 0 if no bases are
    // aligned.
    double identity = 16;

    // An estimate of the length of the fragment, if this Alignment is paired.
    // NOTE(review): the estimate is carried in `Path` messages rather than in
    // a numeric field; presumably each Path names a reference path and uses
    // its `length` annotation for the estimate. Confirm against the writers.
    repeated Path fragment = 17;

    // The loci that this alignment supports.
    // TODO: get rid of this, we have annotations in our data model again.
    repeated Locus locus = 18;

    // Position of the alignment in reference paths embedded in graph. Each
    // position has a path name, and the Alignment's minimum position along
    // the path as an offset.
    repeated Position refpos = 19;

    // SAMTools-style flags.

    bool read_paired = 20;
    bool read_mapped = 21;
    bool mate_unmapped = 22;
    bool read_on_reverse_strand = 23;
    bool mate_on_reverse_strand = 24;
    bool soft_clipped = 25;
    bool discordant_insert_size = 26;

    // The fraction of bases in the alignment that are covered by MEMs with
    // <=1 total hits in the graph.
    double uniqueness = 27;

    // Correctness metric: 1 = perfectly aligned to truth, 0 = not overlapping
    // true alignment.
    double correct = 28;

    // The ordered list of scores of secondary mappings.
    repeated int32 secondary_score = 29;

    // Score under the given fragment model, assume higher is better.
    double fragment_score = 30;

    bool mate_mapped_to_disjoint_subgraph = 31;

    // The fragment length distribution under which a paired-end alignment was
    // aligned.
    string fragment_length_distribution = 32;

    // The time this alignment took.
    double time_used = 35;

    // A path/offset/orientation pair specifying the distance to the correct
    // alignment.
    Position to_correct = 36;

    // This can be set to true to annotate the Alignment as having been mapped
    // correctly.
    bool correctly_mapped = 37;

    // Annotations carried along with the Alignment.
    google.protobuf.Struct annotation = 100;
}

// A subgraph of the unrolled Graph in which each non-branching path is
// associated with an alignment of part of the read and part of the graph,
// such that any path through the MultipathAlignment indicates a valid
// alignment of a read to the graph.
message MultipathAlignment {
    string sequence = 1;
    bytes quality = 2;
    string name = 3;
    string sample_name = 4;
    string read_group = 5;

    // Non-branching paths of the multipath alignment, each containing an
    // alignment of part of the sequence to a Graph.
    // IMPORTANT: downstream applications will assume these are stored in
    // topological order.
    repeated Subpath subpath = 6;

    // -10 * log_10(probability of mismapping)
    int32 mapping_quality = 7;

    // Optional: indices of Subpaths that align the beginning of the read
    // (i.e. source nodes).
    repeated uint32 start = 8;

    string paired_read_name = 9;

    // Annotations carried along with the MultipathAlignment.
    google.protobuf.Struct annotation = 100;
}

// A non-branching path of a MultipathAlignment.
message Subpath {
    // Describes node sequence and edits to the graph sequences.
    Path path = 1;

    // The indices of subpaths in the multipath alignment that are to the
    // right of this path, where right is in the direction of the end of the
    // read sequence.
    repeated uint32 next = 2;

    // Score of this subpath's alignment.
    int32 score = 3;

    // Connections to other subpaths that are not necessarily contiguous in
    // the graph.
    repeated Connection connection = 4;
}

// An edge in a MultipathAlignment between Subpaths that may not be
// contiguous in the graph.
message Connection {
    // The index of the Subpath that this connection points to.
    uint32 next = 1;

    // The score of this connection.
    int32 score = 2;
}

// Used to serialize kmer matches.
message KmerMatch {
    string sequence = 1;
    int64 node_id = 2;
    sint32 position = 3;

    // If true, this kmer is backwards relative to its node, and position
    // counts from the end of the node.
    bool backward = 4;
}

// Summarizes reads that map to a single position in the graph.
// This structure is pretty much identical to a line in Samtools pileup
// format.
message BasePileup {
    int32 ref_base = 1;
    int32 num_bases = 2;
    string bases = 3;

    // If qualities are set, this must have size = num_bases.
    bytes qualities = 4;
}

// Collects pileup records by node. This saves some space and hashing over
// storing them individually, assuming the pileups are not too sparse and the
// average node length is more than a couple of bases.
message NodePileup {
    int64 node_id = 1;

    // The ith BasePileup in the array corresponds to the position at
    // offset i.
    repeated BasePileup base_pileup = 2;
}

// Keeps a pileup-like record for reads that span edges.
message EdgePileup {
    Edge edge = 1;

    // Total reads mapped.
    int32 num_reads = 2;

    // Number of reads mapped on forward strand.
    int32 num_forward_reads = 3;

    bytes qualities = 4;
}

// Bundles up Node and Edge pileups.
message Pileup {
    // The per-node pileups in the bundle.
    repeated NodePileup node_pileups = 1;

    // The per-edge pileups in the bundle.
    repeated EdgePileup edge_pileups = 2;
}

// Enumeration of the classifications of snarls.
// UNCLASSIFIED is the zero value, so it is what an unset SnarlType field
// reads as.
enum SnarlType {
    UNCLASSIFIED = 0;
    ULTRABUBBLE = 1;
    UNARY = 2;
}

// Describes a subgraph that is connected to the rest of the graph by two
// nodes.
message Snarl {
    // What type of snarl is this?
    SnarlType type = 1;

    // Visit that connects the Snarl to the rest of the graph; points *INTO*
    // the snarl.
    Visit start = 2;

    // Visit that connects the Snarl to the rest of the graph; points
    // *OUT OF* the snarl.
    Visit end = 3;

    // If this Snarl is nested in another, this field should be filled in with
    // a Snarl that has the start and end visits filled in (other information
    // is optional/extraneous).
    Snarl parent = 4;

    // Allows snarls to be named, e.g. by the hash of the VCF variant they
    // come from.
    string name = 5;

    // Indicates whether there is a reversing path contained in the Snarl from
    // the start to itself.
    bool start_self_reachable = 6;

    // Indicates whether there is a reversing path contained in the Snarl from
    // the end to itself.
    bool end_self_reachable = 7;

    // Indicates whether the start of the Snarl is connected through to the
    // end.
    bool start_end_reachable = 8;

    // Indicates whether the snarl's net graph is free of directed cycles.
    bool directed_acyclic_net_graph = 9;
}

// Describes a step of a walk through a Snarl, either on a node or through a
// child Snarl.
message Visit {
    // The node ID of this step. Only one of `node_id` and `snarl` should be
    // given.
    int64 node_id = 1;

    // The snarl of this step; only needs to contain the start and end Visits.
    // Only one of `node_id` and `snarl` should be given.
    Snarl snarl = 2;

    // The segment name of this step.
    string name = 4;

    // Indicates:
    //   if node_id is specified     reverse complement of node
    //   if snarl is specified       traversal of a child snarl entering
    //                               backwards through end and leaving
    //                               backwards through start
    bool backward = 3;
}

// Describes a walk through a Snarl where each step is given as either a node
// or a child Snarl (leaving the walk through the child Snarl to another
// SnarlTraversal).
message SnarlTraversal {
    // Steps of the walk through a Snarl, including the start and end nodes.
    // If the traversal includes a Visit that represents a Snarl, both the
    // node entering the Snarl and the node leaving the Snarl should be
    // included in the traversal.
    repeated Visit visit = 1;

    // The name of the traversal can be used for a variant allele id (e.g.
    // <parentSnarlHash>_0, <parentSnarlHash>_1, ...) or for some other
    // arbitrary annotation, unique or non-unique (e.g. deleterious,
    // gain_of_function, etc.), though these will be lost in any indices.
    string name = 2;
}

// Describes a genetic locus with multiple possible alleles, a genotype, and
// observational support.
message Locus {
    // A locus may have an identifying name.
    string name = 1;

    // These are all the alleles at the locus, not just the called ones.
    // Note that a primary reference allele may or may not appear.
    repeated Path allele = 2;

    // These supports are per-allele, matching the entries of `allele`.
    repeated Support support = 3;

    // Sorted by likelihood or posterior; the first one is the "call".
    repeated Genotype genotype = 4;

    // We also have a Support for the locus overall, because reads may have
    // supported multiple alleles and we want to know how many total there
    // were.
    Support overall_support = 5;

    // We track the likelihood of each allele individually, in addition to
    // genotype likelihoods. Stores the likelihood natural logged.
    repeated double allele_log_likelihood = 6;
}

// Describes a genotype at a particular locus.
message Genotype {
    // These refer to the offsets of the alleles in the Locus object.
    repeated int32 allele = 1;

    bool is_phased = 2;

    double likelihood = 3;

    // Likelihood natural logged.
    double log_likelihood = 4;

    // Prior natural logged.
    double log_prior = 5;

    // Posterior natural logged (unnormalized).
    double log_posterior = 6;
}

// Aggregates information about the reads supporting an allele.
message Support {
    // The overall quality of all the support, as
    // -10 * log10(P(all support is wrong)).
    double quality = 1;

    // The number of supporting reads on the forward strand (which may be
    // fractional).
    double forward = 2;

    // The number of supporting reads on the reverse strand (which may be
    // fractional).
    double reverse = 3;

    // TODO: what is this?
    double left = 4;

    // TODO: what is this?
    double right = 5;
}

// Support pinned to a location, which can be either a node or an edge.
message LocationSupport {
    // The support.
    Support support = 1;

    // The location: exactly one of an edge or a node ID.
    oneof oneof_location {
        Edge edge = 2;
        int64 node_id = 3;
    }
}

// A Translation maps from one graph to another.
// A collection of these provides a covering mapping between a "from" graph
// and a "to" graph.
// If each "from" path through the base graph corresponds to a "to" path in an
// updated graph, then we can use these translations to project positions,
// mappings, and paths in the new graph into the old one using the Translator
// interface.
message Translation {
    // The path in the base ("from") graph.
    Path from = 1;

    // The corresponding path in the updated ("to") graph.
    Path to = 2;
}