// Copyright (c) 2012 Stanford University
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// Declared explicitly: this schema relies on proto2 field labels
// ('required'/'optional'), and omitting the statement only works because
// protoc falls back to proto2 with a warning.
syntax = "proto2";

package LogCabin.Protocol.Raft;

/**
 * \file
 * This file contains the formats for RPCs between servers using the Raft
 * protocol.
 */

/**
 * Identifies which RPC is being executed.
 */
enum OpCode {
    // Keep these numbers dense.
    REQUEST_VOTE = 1;
    APPEND_ENTRIES = 2;
    INSTALL_SNAPSHOT = 3;
}

/**
 * The type of "service-specific error" replies that this service returns.
 */
message Error {
    // No errors are defined.
    //enum Code {
    //};
    //required Code error_code = 1;
}

/**
 * A server in a configuration.
 */
message Server {
    /**
     * The ID of the server.
     */
    required uint64 server_id = 1;

    /**
     * The network address(es) of the server (comma-delimited).
     */
    required string addresses = 2;
}

/**
 * A stable configuration, in which a simple majority constitutes a quorum.
 */
message SimpleConfiguration {
    /**
     * The servers that make up this configuration.
     */
    repeated Server servers = 1;
}

/**
 * A stable or transitional configuration.
 */
message Configuration {
    /**
     * The servers in a stable configuration, or the old servers in a
     * transitional configuration.
     */
    required SimpleConfiguration prev_configuration = 1;

    /**
     * Not present in a stable configuration, or the new servers in a
     * transitional configuration.
     */
    optional SimpleConfiguration next_configuration = 2;
}

/**
 * The type of payload stored in a log entry.
 */
enum EntryType {
    /**
     * This must be the first value in the enum and is never assigned
     * explicitly. If new values are added, old code will see them as set
     * and equal to this value (though they will still serialize to the
     * correct value).
     */
    UNKNOWN = 0;

    /**
     * Stores a 'Configuration', to be used internally by the consensus module.
     */
    CONFIGURATION = 1;

    /**
     * Stores a command to be processed by the state machine.
     */
    DATA = 2;

    /**
     * No op.
     */
    NOOP = 3;
}

/**
 * Log entry.
 */
message Entry {
    /**
     * The term in which the entry was first created.
     */
    required uint64 term = 1;

    /**
     * The index for the entry. It's truly optional (if you receive an entry
     * over the network, you shouldn't rely on this). It's used for followers
     * to check that they've got the entry at the right index and also used
     * internally in some storage backends such as SegmentedLog.
     */
    optional uint64 index = 5;

    /**
     * Roughly the number of nanoseconds the cluster has had a leader. This
     * gets passed onto state machines, where it's used for cleaning up client
     * sessions. See ClusterClock for more details.
     */
    required uint64 cluster_time = 6;

    /**
     * See EntryType.
     */
    optional EntryType type = 2;

    // The following are mutually exclusive, depending on 'type'. The schema
    // itself does not enforce this (they are plain optional fields, not a
    // oneof).

    /**
     * A Configuration to be used internally by the consensus module.
     */
    optional Configuration configuration = 3;

    /**
     * A command to be processed by the state machine.
     */
    optional bytes data = 4;
}

/**
 * RequestVote RPC: ask a server for its vote in an election and/or get a
 * server's log information.
 */
message RequestVote {
    message Request {
        /**
         * ID of the caller, so that if it re-requests the vote from the
         * callee, the server will respond granted.
         */
        required uint64 server_id = 1;

        /**
         * Caller's term.
         */
        required uint64 term = 2;

        /**
         * Term of last entry in caller's log.
         * Used to compare log completeness.
         */
        required uint64 last_log_term = 3;

        /**
         * ID of last entry in caller's log.
         * Used to compare log completeness.
         */
        required uint64 last_log_index = 4;
    }

    message Response {
        /**
         * Callee's term, for caller to update itself.
         */
        required uint64 term = 1;

        /**
         * True if the follower granted the candidate its vote, false
         * otherwise.
         */
        required bool granted = 2;

        /**
         * Set to true if the caller's log is as up-to-date as the recipient's.
         * This isn't presently used, but it's easy to return and might be
         * useful later. In the Pre-Vote extension to Raft described in Section
         * 9.6 "Preventing disruptions when a server rejoins the cluster" of
         * Diego Ongaro's PhD dissertation, a server needs to know if its log
         * is sufficient to get elected before actually starting an election.
         */
        optional bool log_ok = 3;
    }
}

/**
 * AppendEntries RPC: replicate log entries to a follower.
 */
message AppendEntries {
    message Request {
        /**
         * ID of leader (caller), so the follower can redirect clients.
         */
        required uint64 server_id = 1;

        /**
         * Caller's term.
         */
        required uint64 term = 2;

        /**
         * ID of entry preceding entries, or ID of last log entry for
         * heartbeat.
         */
        required uint64 prev_log_index = 3;

        /**
         * Term of prev_log_index.
         */
        required uint64 prev_log_term = 4;

        /**
         * Log entries for follower to store, or empty for heartbeat.
         */
        repeated Entry entries = 5;

        /**
         * Last committed entry that the follower has, so the follower can
         * advance its state machine.
         */
        required uint64 commit_index = 6;
    }

    message Response {
        /**
         * Callee's term, for the caller to update itself.
         */
        required uint64 term = 1;

        /**
         * True if new entries were added to the log.
         */
        required bool success = 2;

        /**
         * The recipient's last log index (after it's applied this RPC's
         * changes to the log). This is used to speed up finding the correct
         * value for nextIndex with a follower that is far behind the leader.
         */
        optional uint64 last_log_index = 3;

        /**
         * Describes which state machine versions the responding server can
         * run. Both bounds are inclusive.
         */
        message ServerCapabilities {
            /**
             * The server's state machine can behave like all state machine
             * versions between min_supported_state_machine_version and
             * max_supported_state_machine_version, inclusive.
             */
            optional uint32 min_supported_state_machine_version = 1;

            /**
             * The server's state machine can behave like all state machine
             * versions between min_supported_state_machine_version and
             * max_supported_state_machine_version, inclusive.
             */
            optional uint32 max_supported_state_machine_version = 2;
        }

        /**
         * Sent back to inform leader of what code the recipient is running.
         */
        optional ServerCapabilities server_capabilities = 4;
    }
}

/**
 * InstallSnapshot RPC: replicate part of a snapshot file to a follower.
 */
message InstallSnapshot {
    message Request {
        // NOTE(review): field number 2 is not used in this message. If it was
        // assigned to a since-removed field, do not reuse it -- confirm
        // against the file's history before adding new fields here.

        /**
         * ID of leader (caller), so the follower can redirect clients.
         */
        required uint64 server_id = 1;

        /**
         * Caller's term.
         */
        required uint64 term = 3;

        /**
         * The snapshot covers log entries in the range [1, lastSnapshotIndex].
         * While this information can be found in the snapshot itself, a
         * follower may want to know sooner (for example, to name the snapshot
         * file or to assert that it's not overwriting something more
         * important).
         */
        required uint64 last_snapshot_index = 4;

        /**
         * The byte offset where 'data' belongs in the file. Followers can
         * expect this to grow without gaps, but they should use this to drop
         * duplicate request messages.
         */
        required uint64 byte_offset = 5;

        /**
         * Raw bytes of the snapshot file. This should be big enough to achieve
         * reasonable throughput without having to pipeline RPCs.
         */
        required bytes data = 6;

        /**
         * Set to true if this is the last chunk of the file and the follower
         * should now load the contents; false otherwise.
         */
        required bool done = 7;

        /**
         * Explains which version of this RPC the leader (caller) supports.
         * - Servers speaking version 1 of this RPC (corresponding to LogCabin
         *   release v1.0.0 and up to a few weeks after) did not set this
         *   field.
         * - Version 2 introduced the bytes_stored field in responses. Before
         *   this, leaders assumed that InstallSnapshot always succeeded if the
         *   term matched.
         */
        optional uint32 version = 8;
    }

    message Response {
        /**
         * Callee's term, for the caller to update itself.
         */
        required uint64 term = 1;

        /**
         * The total number of bytes in the snapshot that the follower has
         * stored (after applying the request).
         *
         * This was introduced to fix
         * https://github.com/logcabin/logcabin/issues/174 .
         * Before, followers could only receive snapshot chunks in sequence. If
         * they restarted, they'd have discarded their previous chunks and they
         * would just repeatedly PANIC. This is now used to signal to the
         * leader how many bytes the follower actually has saved. The leader
         * considers the snapshot transfer complete when bytes_stored equals
         * the full size of the snapshot. The leader should use bytes_stored
         * as the value for byte_offset in the next request (most importantly,
         * when a follower reboots, it returns 0 here and the leader starts at
         * offset 0 in the next request).
         *
         * Leaders that do not support InstallSnapshot version 2 entirely
         * ignore this field.
         */
        optional uint64 bytes_stored = 2;
    }
}