// Copyright 2014 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

syntax = "proto3";
package cockroach.roachpb;
option go_package = "roachpb";

import "roachpb/metadata.proto";
import "storage/engine/enginepb/mvcc.proto";
import "storage/engine/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";

// Span is a key range with an inclusive start Key and an exclusive end Key.
message Span {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  reserved 1, 2;
  // The start key of the key range.
  bytes key = 3 [(gogoproto.casttype) = "Key"];
  // The end key of the key range. The value is empty if the key range
  // contains only a single key. Otherwise, it must order strictly after Key.
  // In such a case, the Span encompasses the key range from Key to EndKey,
  // including Key and excluding EndKey.
  bytes end_key = 4 [(gogoproto.casttype) = "Key"];
}

// ValueType defines a set of type constants placed in the "tag" field of Value
// messages. These are defined as a protocol buffer enumeration so that they
// can be used portably between our Go and C code. The tags are used by the
// RocksDB Merge Operator to perform specialized merges.
enum ValueType {
  // This is a subset of the SQL column type values, representing the underlying
  // storage for various types. The DELIMITED_foo entries each represent a foo
  // variant that self-delimits length.
  UNKNOWN = 0;
  reserved 7;
  INT = 1;
  FLOAT = 2;
  BYTES = 3;
  DELIMITED_BYTES = 8;
  TIME = 4;
  DECIMAL = 5;
  DELIMITED_DECIMAL = 9;
  DURATION = 6;

  // TUPLE represents a DTuple, encoded as repeated pairs of varint field number
  // followed by a value encoded Datum.
  TUPLE = 10;

  BITARRAY = 11;

  // TIMESERIES is applied to values which contain InternalTimeSeriesData.
  TIMESERIES = 100;
}

// Value specifies the value at a key. Multiple values at the same key are
// supported based on timestamp. The data stored within a value is typed
// (ValueType) and custom encoded into the raw_bytes field. A custom encoding
// is used instead of separate proto fields to avoid proto overhead and to
// avoid unnecessary encoding and decoding as the value gets read from disk and
// passed through the network. The format is:
//
//   <4-byte-checksum><1-byte-tag><encoded-data>
//
// A CRC-32-IEEE checksum is computed from the associated key, tag and encoded
// data, in that order.
//
// TODO(peter): Is a 4-byte checksum overkill when most (all?) values
// will be less than 64KB?
message Value {
  option (gogoproto.equal) = true;

  // raw_bytes contains the encoded value and checksum.
  bytes raw_bytes = 1;
  // Timestamp of value.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// KeyValue is a pair of Key and Value for returned Key/Value pairs
// from ScanRequest/ScanResponse. It embeds a Key and a Value.
message KeyValue {
  // The key of the pair.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The value stored at the key.
  Value value = 2 [(gogoproto.nullable) = false];
}

// A StoreIdent uniquely identifies a store in the cluster. The
// StoreIdent is written to the underlying storage engine at a
// store-reserved system key (KeyLocalIdent).
message StoreIdent {
  // The ID of the cluster, represented in Go as a uuid.UUID.
  bytes cluster_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "ClusterID",
      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
  // The ID of the node.
  // NOTE(review): presumably the node that owns the store - confirm.
  int32 node_id = 2 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  // The ID of the store.
  int32 store_id = 3 [(gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
}

// A SplitTrigger is run after a successful commit of an AdminSplit
// command. It provides the updated left hand side of the split's
// range descriptor (left_desc) and the new range descriptor covering
// the right hand side of the split (right_desc). This information
// allows the final bookkeeping for the split to be completed and the
// new range put into operation.
message SplitTrigger {
  option (gogoproto.equal) = true;

  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];
  reserved 3;
}

// A MergeTrigger is run after a successful commit of an AdminMerge
// command. It provides the updated left hand side of the merge's
// range descriptor (left_desc) that now encompasses what was
// originally both ranges and the soon-to-be-invalid range descriptor
// that used to cover the subsumed, right hand side of the merge
// (right_desc). This information allows the final bookkeeping for the
// merge to be completed and put into operation.
message MergeTrigger {
  option (gogoproto.equal) = true;

  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];

  reserved 3;

  // NOTE(review): presumably the MVCC stats of the subsumed right-hand
  // range - confirm.
  storage.engine.enginepb.MVCCStats right_mvcc_stats = 4 [
    (gogoproto.customname) = "RightMVCCStats",
    (gogoproto.nullable) = false
  ];

  // FreezeStart is a timestamp that is guaranteed to be greater than the
  // timestamps at which any requests were serviced by the responding replica
  // before it stopped responding to requests altogether (in anticipation of
  // being subsumed). It is suitable for use as the timestamp cache's low water
  // mark for the keys previously owned by the subsumed range.
  util.hlc.Timestamp freeze_start = 5 [(gogoproto.nullable) = false];
}

// ReplicaChangeType is a parameter of ChangeReplicasTrigger.
enum ReplicaChangeType {
  option (gogoproto.goproto_enum_prefix) = false;

  ADD_REPLICA = 0;
  REMOVE_REPLICA = 1;
}

// A ChangeReplicasTrigger carries the details of a single replica change:
// the kind of change (see ReplicaChangeType), the replica being modified and
// the replica list that results from applying the change.
message ChangeReplicasTrigger {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;

  // TODO(benesch): this trigger should just specify the updated descriptor,
  // like the split and merge triggers, so that the receiver doesn't need to
  // reconstruct the range descriptor update.
  ReplicaChangeType change_type = 1;
  // The replica being modified.
  ReplicaDescriptor replica = 2 [(gogoproto.nullable) = false];
  // The new replica list with this change applied.
  repeated ReplicaDescriptor updated_replicas = 3 [(gogoproto.nullable) = false];
  // NOTE(review): presumably the replica ID to hand out to the next replica
  // added to the range - confirm.
  int32 next_replica_id = 4 [(gogoproto.customname) = "NextReplicaID", (gogoproto.casttype) = "ReplicaID"];
}

// ModifiedSpanTrigger indicates that a specific span has been modified.
// This can be used to trigger scan-and-gossip for the given span.
message ModifiedSpanTrigger {
  option (gogoproto.equal) = true;

  // NOTE(review): presumably set when the system config span was modified -
  // confirm.
  bool system_config_span = 1;
  // node_liveness_span is set to indicate that node liveness records
  // need re-gossiping after modification or range lease updates. The
  // span is set to a single key when nodes update their liveness records
  // with heartbeats to extend the expiration timestamp. Changes to the
  // range lease for the range containing node liveness trigger re-gossip
  // of the entire node liveness key range.
  Span node_liveness_span = 2;
}

// InternalCommitTrigger encapsulates all of the internal-only commit triggers.
// Only one may be set.
message InternalCommitTrigger {
  option (gogoproto.equal) = true;

  // InternalCommitTrigger is always nullable, and these getters are
  // nil-safe, which is often convenient.
  option (gogoproto.goproto_getters) = true;

  SplitTrigger split_trigger = 1;
  MergeTrigger merge_trigger = 2;
  ChangeReplicasTrigger change_replicas_trigger = 3;
  ModifiedSpanTrigger modified_span_trigger = 4;
}

// TransactionStatus specifies possible states for a transaction.
enum TransactionStatus {
  option (gogoproto.goproto_enum_prefix) = false;

  // PENDING is the default state for a new transaction. Transactions
  // move from PENDING to one of COMMITTED or ABORTED. Mutations made
  // as part of a PENDING transaction are recorded as "intents" in
  // the underlying MVCC model.
  PENDING = 0;
  // COMMITTED is the state for a transaction which has been
  // committed. Mutations made as part of a transaction which is moved
  // into COMMITTED state become durable and visible to other
  // transactions, moving from "intents" to permanent versioned
  // values.
  COMMITTED = 1;
  // ABORTED is the state for a transaction which has been aborted.
  // Mutations made as part of a transaction which is moved into
  // ABORTED state are deleted and are never made visible to other
  // transactions.
  ABORTED = 2;
}

// ObservedTimestamp pairs a NodeID with a timestamp. See
// Transaction.observed_timestamps for how these pairs are used.
message ObservedTimestamp {
  option (gogoproto.equal) = true;

  option (gogoproto.populate) = true;

  int32 node_id = 1 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// A Transaction is a unit of work performed on the database.
// Cockroach transactions support two isolation levels: snapshot
// isolation and serializable snapshot isolation. Each Cockroach
// transaction is assigned a random priority. This priority will be
// used to decide whether a transaction will be aborted during
// contention.
//
// If you add fields to Transaction you'll need to update
// Transaction.Clone. Failure to do so will result in test failures.
message Transaction {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The transaction metadata. These are persisted with every intent.
  storage.engine.enginepb.TxnMeta meta = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // A free-text identifier for debug purposes.
  string name = 2;
  // The status of the transaction (see TransactionStatus).
  TransactionStatus status = 4;
  // NOTE(review): presumably the timestamp of the most recent heartbeat of
  // the transaction record - confirm.
  util.hlc.Timestamp last_heartbeat = 5 [(gogoproto.nullable) = false];
  // The original timestamp at which the transaction started. For serializable
  // transactions, if the timestamp drifts from the original timestamp, the
  // transaction will retry unless we manage to "refresh the reads" - see
  // refreshed_timestamp.
  //
  // This timestamp is the one at which all transactions will read, unless
  // refreshed_timestamp is set. It is also, surprisingly, the timestamp at
  // which transactions will provisionally _write_ (i.e. intents are written at
  // this orig_timestamp and, after commit, when the intents are resolved,
  // their timestamps are bumped to the commit timestamp), if
  // refreshed_timestamp isn't set.
  // This is ultimately because of correctness concerns around SNAPSHOT
  // transactions.
  //
  // Intuitively, one could think that the timestamp at which intents should be
  // written should be the provisional commit timestamp, and while this is
  // morally true, consider the following scenario, where txn1 is a SNAPSHOT
  // txn:
  //
  // - txn1 at orig_timestamp=5 reads key1: (value) 1.
  // - txn1 writes elsewhere, has its commit timestamp increased to 20.
  // - txn2 at orig_timestamp=10 reads key1: 1
  // - txn2 increases the value by 5: key1: 6 and commits
  // - txn1 increases the value by 1: key1: 2, attempts commit
  //
  // If txn1 uses its orig_timestamp for updating key1 (as it does), it
  // conflicts with txn2's committed value (which is at timestamp 10, in the
  // future of 5), and restarts.
  // Using instead its candidate commit timestamp, it wouldn't see a conflict
  // and commit, but this is not the expected outcome (the expected outcome is
  // {key1: 6} (since txn1 is not expected to commit)) and we would be
  // experiencing the Lost Update Anomaly.
  //
  // Note that in practice, before restarting, txn1 would still lay down an
  // intent (just above the committed value) not with the intent to commit it,
  // but to avoid being starved by short-lived transactions on that key which
  // would otherwise not have to go through conflict resolution with txn1.
  //
  // Again, keep in mind that, when the transaction commits, all the intents are
  // bumped to the commit timestamp (otherwise, pushing a transaction wouldn't
  // achieve anything).
  util.hlc.Timestamp orig_timestamp = 6 [(gogoproto.nullable) = false];
  // Initial Timestamp + clock skew. Reads which encounter values with
  // timestamps between timestamp and max_timestamp trigger a txn
  // retry error, unless the node being read is listed in observed_timestamps
  // (in which case no more read uncertainty can occur).
  // The case max_timestamp < timestamp is possible for transactions which have
  // been pushed; in this case, max_timestamp should be ignored.
  util.hlc.Timestamp max_timestamp = 7 [(gogoproto.nullable) = false];
  // The refreshed timestamp is the timestamp at which the transaction
  // can commit without necessitating a serializable restart. This
  // value is forwarded to the transaction's current timestamp (meta.timestamp)
  // if the transaction coordinator is able to refresh all refreshable spans
  // encountered during the course of the txn. If set, this takes precedence
  // over orig_timestamp and is the timestamp at which the transaction both
  // reads and writes going forward.
  // We need to keep track of both refreshed_timestamp and orig_timestamp
  // (instead of simply overwriting the orig_timestamp after refreshes) because
  // the orig_timestamp needs to be used as a lower bound timestamp for the
  // time-bound iterator used to resolve intents - i.e. there can be intents to
  // resolve up to the timestamp that the txn started with.
  util.hlc.Timestamp refreshed_timestamp = 15 [(gogoproto.nullable) = false];
  // A list of <NodeID, timestamp> pairs. The list maps NodeIDs to timestamps
  // as observed from their local clock during this transaction. The purpose of
  // this map is to avoid uncertainty related restarts which normally occur
  // when reading a value in the near future as per the max_timestamp field.
  //
  // Morally speaking, having an entry for a node in this map means that this
  // node has been visited before, and that no more uncertainty restarts are
  // expected for operations served from it. However, this is not entirely
  // accurate. For example, say a txn starts with orig_timestamp=1 (and some
  // large max_timestamp). It then reads key "a" from node A, registering an
  // entry `A -> 5` in the process (`5` happens to be a timestamp taken off
  // that node's clock at the end of the read).
  // Now assume that some other transaction writes and commits a value at key "b"
  // and timestamp 4 (again, served by node A), and our transaction attempts to
  // read that key. Since there is an entry in its observed_timestamps for A,
  // our uncertainty window is `[orig_timestamp, 5) = [1, 5)` but the value at
  // key "b" is in that window, and so we will restart. However, we will restart
  // with a timestamp that is at least as high as our entry in the map for node
  // A, so no future operation on node A will be uncertain.
  //
  // Thus, expressed properly, you could say that when a node has been read from
  // successfully before, uncertainty on that node is restricted to values with
  // timestamps in the interval [orig_timestamp, first_visit_timestamp), and
  // that no node will trigger restarts more than once (and in fact, usually
  // the first restart also bumps the txn timestamp enough to clear all other
  // nodes).
  //
  // When this list holds a corresponding entry for the node the current
  // request is executing on, we can run the command with the map's timestamp
  // as the top boundary of our uncertainty interval, limiting (and often
  // avoiding) uncertainty restarts.
  //
  // When a transaction is first initialized on a node, it may use a timestamp
  // from the local hybrid logical clock to initialize the corresponding entry
  // in the map. In particular, if `orig_timestamp` is taken from that node's
  // clock, we may add that to the map, which eliminates read uncertainty for
  // reads on that node.
  //
  // The list of observed timestamps is kept sorted by NodeID. Use
  // Transaction.UpdateObservedTimestamp to maintain the sorted order.
  repeated ObservedTimestamp observed_timestamps = 8 [(gogoproto.nullable) = false];
  // Writing is true if the transaction has previously sent a Begin transaction
  // (i.e. if it ever attempted to perform a write, so if it ever attempted to
  // leave intents (across retries)). The flag will be set even if the BeginTxn
  // batch failed.
  // When set, the AbortCache must be checked by reads so that they don't fail
  // to see the txn's previous writes.
  // NOTE(review): "AbortCache" presumably refers to the range's AbortSpan (see
  // AbortSpanEntry) - confirm and align the terminology.
  bool writing = 9;
  // If this is true, the transaction must retry. Relevant only for
  // SNAPSHOT transactions: a SERIALIZABLE transaction would have to
  // retry anyway due to its commit timestamp having moved forward (whenever
  // write_too_old is set, meta.Timestamp has been pushed above orig_timestamp).
  // This bool is set instead of immediately returning a txn retry
  // error so that intents can continue to be laid down, minimizing
  // work required on txn restart.
  bool write_too_old = 12;
  // If retry_on_push is true, the transaction must retry in the event
  // that the commit timestamp is pushed forward. This flag is set if
  // the transaction contains any calls to DeleteRange, in order to
  // prevent the LostDeleteRange anomaly. This flag is relevant only
  // for SNAPSHOT transactions.
  bool retry_on_push = 13;
  // NOTE(review): presumably the key spans at which this transaction has
  // written intents - confirm.
  repeated Span intents = 11 [(gogoproto.nullable) = false];
  // Epoch zero timestamp is used to keep track of the earliest timestamp
  // that any epoch of the transaction used. This is set only if the
  // transaction is restarted and the epoch is bumped. It is used during
  // intent resolution to more efficiently scan for intents.
  util.hlc.Timestamp epoch_zero_timestamp = 14 [(gogoproto.nullable) = false];
  // This flag is set if the transaction's original timestamp was
  // "leaked" beyond the transaction (i.e. if returned via NOW() or
  // transaction_timestamp()). If true, this prevents optimizations
  // which commit at a higher timestamp without resorting to a
  // client-side retry.
  bool orig_timestamp_was_observed = 16;
}

// An Intent is a Span together with a Transaction metadata and its status.
message Intent {
  option (gogoproto.equal) = true;

  // The span covered by the intent; embedded in the generated Go struct.
  Span span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // The metadata of the transaction.
  storage.engine.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
  // The status of the transaction.
  TransactionStatus status = 3;
}

// A SequencedWrite is a point write to a key with a certain sequence number.
//
// TODO(nvanbenschoten/tschottdorf): This message type can be used as the
// PromisedWrites repeated field in EndTransaction in the parallel commits
// proposal (#24194).
message SequencedWrite {
  // The key that the write was made at.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The sequence number of the request that created the write.
  int32 sequence = 2;
}

// Lease contains information about range leases including the
// expiration and lease holder.
message Lease {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The start is a timestamp at which the lease begins. This value
  // must be greater than the last lease expiration or the lease request
  // is considered invalid.
  util.hlc.Timestamp start = 1 [(gogoproto.nullable) = false];

  // The expiration is a timestamp at which the lease expires. This means that
  // a new lease can be granted for a later timestamp.
  util.hlc.Timestamp expiration = 2 [(gogoproto.moretags) = "cockroachdb:\"randnullable\""];

  // The address of the would-be lease holder.
  ReplicaDescriptor replica = 3 [(gogoproto.nullable) = false];

  // The start of the lease stasis period. This field is deprecated.
  util.hlc.Timestamp deprecated_start_stasis = 4 [(gogoproto.moretags) = "cockroachdb:\"randnullable\""];

  // The current timestamp when this lease has been proposed. Used after a
  // transfer and after a node restart to enforce that a node only uses leases
  // proposed after the time of the said transfer or restart. This is nullable
  // to help with the rollout (such that a lease applied by some nodes before
  // the rollout and some nodes after the rollout is serialized the same).
  // TODO(andrei): Make this non-nullable after the rollout.
  util.hlc.Timestamp proposed_ts = 5 [(gogoproto.customname) = "ProposedTS"];

  // The epoch of the lease holder's node liveness entry. If this value
  // is non-zero, the start and expiration values are ignored.
  int64 epoch = 6;

  // A zero-indexed sequence number which is incremented during the acquisition
  // of each new range lease that is not equivalent to the previous range lease
  // (i.e. an acquisition that implies a leaseholder change). The sequence
  // number is used to detect lease changes between command proposal and
  // application without requiring that we send the entire lease through Raft.
  // Lease sequence numbers are a reflection of the "lease equivalency" property
  // (see Lease.Equivalent). Two adjacent leases that are equivalent will have
  // the same sequence number and two adjacent leases that are not equivalent
  // will have different sequence numbers.
  int64 sequence = 7 [(gogoproto.casttype) = "LeaseSequence"];
}

// AbortSpanEntry contains information about a transaction which has
// been aborted. It's written to a range's AbortSpan if the range
// may have contained intents of the aborted txn. In the event that
// the same transaction attempts to read keys it may have written
// previously, this entry informs the transaction that it has aborted
// and must start fresh with an updated priority.
message AbortSpanEntry {
  option (gogoproto.equal) = true;

  option (gogoproto.populate) = true;

  // The key of the associated transaction.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The candidate commit timestamp the transaction record held at the time
  // it was aborted.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
  // The priority of the transaction.
  int32 priority = 3;
}

// TxnCoordMeta is metadata held by a transaction coordinator. This
// message is defined here because it is used in several layers of the
// system (internal/client, sql/distsqlrun, kv).
message TxnCoordMeta {
  // txn is a copy of the transaction record, updated with each request.
  Transaction txn = 1 [(gogoproto.nullable) = false];
  // intents stores key spans affected by this transaction through
  // this coordinator. These spans allow the coordinator to set the
  // list of intent spans in the EndTransactionRequest when the
  // transaction is finalized.
  repeated Span intents = 2 [(gogoproto.nullable) = false];
  // command_count indicates how many requests have been sent through
  // this transaction. Reset on retryable txn errors.
  int32 command_count = 3;
  // refresh_reads and refresh_writes store key spans which were read
  // or, less frequently, written during a transaction. These fields
  // are utilized for SERIALIZABLE transactions in the event a
  // transaction experiences a retry error. In that case, the
  // coordinator uses the Refresh and RefreshRange RPCs to verify that
  // no write has occurred to the spans more recently than the txn's
  // original timestamp, and updates the affected timestamp caches to
  // the transaction's refreshed timestamp. On failure, the retry
  // error is propagated. On success, the transaction's original and
  // current timestamps are forwarded to the refresh timestamp, and
  // the transaction can continue.
  repeated Span refresh_reads = 4 [(gogoproto.nullable) = false];
  repeated Span refresh_writes = 5 [(gogoproto.nullable) = false];
  // refresh_invalid indicates that spans were discarded or not collected
  // (i.e. because of a dist SQL processor running a version before refreshing
  // was introduced). This is false if all spans encountered during the
  // transaction which need refreshing have been collected to the refresh_reads
  // and refresh_writes span slices.
  bool refresh_invalid = 7;
  // deprecated_refresh_valid is the inverse of refresh_invalid. It was
  // deprecated in favor of refresh_invalid in order to give the struct a useful
  // zero value.
  // TODO(nvanbenschoten): Can be removed in 2.2.
  bool deprecated_refresh_valid = 6;
  // outstanding_writes stores all writes that are outstanding and have
  // not yet been resolved. Any client wishing to send a request that
  // overlaps with them must chain on to their success using a QueryIntent
  // request.
  repeated SequencedWrite outstanding_writes = 8 [(gogoproto.nullable) = false];
}