syntax = "proto3";

import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

package eu.deepslate.realtime.speeq;

/*
 * Enumeration of supported audio sample formats.
 *
 * Referenced by `AudioLineConfiguration.sample_format` to describe how the
 * individual samples of an audio line are encoded.
 *
 * `UNSIGNED_8_BIT` is the zero value, so it is what a receiver reads when
 * the field is left unset.
 */
enum SampleFormat {
  // 8-bit unsigned integer samples.
  UNSIGNED_8_BIT = 0;

  // 16-bit signed integer samples.
  SIGNED_16_BIT = 1;

  // 32-bit signed integer samples.
  SIGNED_32_BIT = 2;

  // 32-bit floating point samples (0.0 to 1.0).
  //
  // NOTE(review): float PCM is commonly normalized to -1.0..1.0 - confirm
  // that the documented 0.0..1.0 range is what the service expects.
  FLOAT_32_BIT = 3;

  // 64-bit floating point samples (0.0 to 1.0).
  //
  // NOTE(review): same range question as for FLOAT_32_BIT - confirm.
  FLOAT_64_BIT = 4;
}

/*
 * Definition of a tool that can be used by the model during inference.
 *
 * Tool definitions are supplied to the service as a list via
 * `UpdateToolDefinitionsRequest`.
 */
message ToolDefinition {
  // Name of the tool.
  //
  // This is what the model will use to refer to the tool when requesting its
  // use (see `ToolCallRequest.name`).
  string name = 1;

  // Description of the tool's purpose and functionality.
  //
  // When not present, this will either be omitted or an empty string
  // depending on the inference engine's capabilities.
  optional string description = 2;

  // Parameters schema for the tool.
  //
  // This needs to be a valid JSON Schema object. Most inference engines
  // require the top-level value to be of type "object".
  google.protobuf.Struct parameters = 3;
}

/*
 * Request message to update the tool definitions.
 *
 * This always completely replaces the existing tool definitions with the
 * provided ones; it does not merge with existing definitions.
 *
 * An empty list of tool definitions will clear all existing tool definitions.
 *
 * Sent by the client as `ServiceBoundMessage.update_tool_definitions_request`.
 */
message UpdateToolDefinitionsRequest {
  // Complete list of tool definitions to use from now on.
  repeated ToolDefinition tool_definitions = 1;
}

/*
 * Request message to call a tool.
 *
 * Depending on the inference engine, you may get multiple requests to call
 * tools before getting a final response.
 *
 * Every request **must** be responded to with a corresponding
 * ToolCallResponse, even if tool execution fails or is not possible for some
 * reason.
 *
 * Delivered to the client as `ClientBoundMessage.tool_call_request`; also
 * embedded in the chat history as `ChatMessageContent.tool_call`.
 */
message ToolCallRequest {
  // Identifier of this tool call request.
  //
  // This does **not** identify the tool, but the request to call a specific
  // tool. The matching `ToolCallResponse.id` refers back to this value.
  string id = 1;

  // Name of the tool to call.
  string name = 2;

  // Parameters to pass to the tool.
  google.protobuf.Struct parameters = 3;

  // Session-local ID of the assistant turn that issued this tool call.
  optional uint32 turn_id = 4;
}

/*
 * Response message for a tool call.
 *
 * Sent by the client as `ServiceBoundMessage.tool_call_response`; also
 * embedded in the chat history as `ChatMessageContent.tool_result`.
 */
message ToolCallResponse {
  // Identifier of the tool call request this response corresponds to
  // (i.e. the `id` of the `ToolCallRequest` being answered).
  string id = 1;

  // Result of the tool call.
  //
  // This _can_ be JSON, but it doesn't have to be. As long as the model is
  // able to understand the result, any format is acceptable.
  string result = 2;
}

/*
 * Duration message representing a span of time.
 *
 * This is a package-local type, not the `google.protobuf.Duration`
 * well-known type. Both components are unsigned, so negative spans cannot be
 * represented.
 */
message Duration {
  // Whole seconds of the duration.
  uint64 seconds = 1;

  // Nanoseconds component of the duration, added on top of `seconds`.
  //
  // NOTE(review): presumably expected to stay below 1,000,000,000 (one
  // second) - confirm whether the service validates or normalizes this.
  uint32 nanos = 2;
}

/*
 * Configuration for an audio line (input or output depending on context).
 *
 * The service always assumes that audio is raw PCM with 16-bit samples.
 *
 * NOTE(review): the sentence above appears to conflict with the
 * `sample_format` field below, which allows formats other than 16-bit.
 * Confirm which one reflects the actual service behavior.
 */
message AudioLineConfiguration {
  // Sample rate in Hz.
  uint32 sample_rate = 1;

  // Number of audio channels.
  uint32 channel_count = 2;

  // Sample format.
  //
  // Reads as `UNSIGNED_8_BIT` (the enum's zero value) when left unset.
  SampleFormat sample_format = 3;
}

/*
 * Configuration of the VAD (speech detection) pipeline.
 *
 * Supplied once per session via `InitializeSessionRequest.vad_configuration`.
 */
message VadConfiguration {
  // Minimum confidence required to consider audio as speech (0.0 to 1.0).
  float confidence_threshold = 1;

  // Minimum volume level to consider audio as speech (0.0 to 1.0).
  float min_volume = 2;

  // Duration of speech to detect start of speech.
  Duration start_duration = 3;

  // Duration of silence to detect end of speech.
  Duration stop_duration = 4;

  // Duration of audio to buffer before speech detection.
  // This allows capturing audio from before speech starts.
  //
  // Recommended duration is 1 second.
  Duration backbuffer_duration = 5;
}

/*
 * State machine output of the VAD pipeline.
 *
 * The state machine debounces raw VAD confidence/volume signals using the
 * `start_duration` / `stop_duration` thresholds in `VadConfiguration`.
 *
 * Reported in `VadAnalysisFrame.state` and in the `from_state` / `to_state`
 * fields of `VadStateEvent`.
 */
enum VadState {
  /*
   * No speech is detected at all.
   *
   * This is the zero value, i.e. what an unset field reads as.
   */
  SILENCE = 0;

  /*
   * Speech is actively detected above the configured thresholds, but the
   * `start_duration` has not yet been met.
   */
  SPEECH_STARTING = 1;

  /*
   * Speech is actively detected and the `start_duration` threshold has been
   * met.
   */
  SPEECH = 2;

  /*
   * Speech was detected and has now stopped, but the `stop_duration`
   * threshold has not yet been met.
   */
  SPEECH_ENDING = 3;
}

/*
 * Per-frame VAD signal. Frames are emitted at the VAD frame rate
 * (typically 50 Hz at 20ms frames on 16 kHz audio). Only emitted when
 * `enable_vad_frame_telemetry` is true on the InitializeSessionRequest.
 *
 * Frame indexing is monotonic per session and reflects the post-resampling
 * frame stream that the VAD engine actually saw.
 *
 * This is experimental and may be removed or changed without a major
 * version bump. It is primarily intended for debugging and telemetry
 * purposes, and should not be relied on for critical functionality.
 *
 * Delivered to the client as `ClientBoundMessage.vad_analysis_frame`.
 */
message VadAnalysisFrame {
  // Monotonic frame index per session.
  uint64 frame_index = 1;

  // Time elapsed since the input pipeline started, at the boundary of this
  // frame. This is wall-clock time, not audio-stream time — useful for
  // aligning the telemetry stream with other observations.
  Duration session_time = 2;

  // Raw confidence from the underlying engine (0.0..1.0).
  float confidence = 3;

  // Raw RMS volume from the audio frame (0.0..1.0). 1.0 if the engine
  // does not provide volume.
  float volume = 4;

  // Current state-machine state at the END of this frame.
  VadState state = 5;

  // Packet IDs (see `UserInput.packet_id`) whose audio bytes, after
  // resampling, contributed to this frame, in order of contribution.
  // Usually a single ID; multiple at packet boundaries or when the client
  // sent very small packets. Empty only during resampler warmup before any
  // tagged samples reach the VAD.
  repeated uint64 source_packet_ids = 6;
}

/*
 * State-machine transition event.
 *
 * Emitted whenever the VAD state changes. Always emitted (independent of
 * `enable_vad_frame_telemetry`) so VAD-only consumers and conversational
 * debug clients both get turn boundaries with packet correlation.
 *
 * Delivered to the client as `ClientBoundMessage.vad_state_event`.
 */
message VadStateEvent {
  // Time elapsed since the input pipeline started, at the moment of the
  // transition.
  Duration session_time = 1;

  // The state the machine was in before the transition.
  VadState from_state = 2;

  // The state the machine is in after the transition.
  VadState to_state = 3;

  // The packet_id being processed when the transition triggered. For
  // SpeechStarted-class transitions, this is the packet whose audio caused
  // VAD to cross the threshold. For SpeechStopped-class transitions, it is
  // the most recent packet contributing to the silence-detection window.
  uint64 packet_id = 4;
}

/*
 * Configuration for the inference engine.
 *
 * Supplied via `InitializeSessionRequest.inference_configuration` and can be
 * replaced later via `ReconfigureSessionRequest.inference_configuration`.
 */
message InferenceConfiguration {
  // System prompt to guide the model's behavior.
  string system_prompt = 1;

  // Temperature setting for the model.
  //
  // This controls the randomness of the model's output. Higher values
  // produce more random output, while lower values produce more
  // deterministic output.
  //
  // The field has no explicit presence: an unset temperature and an explicit
  // 0.0 are indistinguishable on the wire.
  double temperature = 2;
}

/*
 * Configuration for the ElevenLabs TTS provider.
 *
 * Selected via `TtsConfiguration.eleven_labs`.
 */
message ElevenLabsTtsConfiguration {
  // ElevenLabs API key for this session.
  string api_key = 1;

  // Voice ID to use (e.g., "21m00Tcm4TlvDq8ikWAM" for Rachel).
  string voice_id = 2;

  // Model ID (e.g., "eleven_turbo_v2"). If not set, uses default.
  optional string model_id = 3;

  // Optional voice settings for fine-tuning TTS output.
  optional ElevenLabsVoiceSettings voice_settings = 4;

  // ElevenLabs service location to use for this session.
  //
  // Reads as `US` (the enum's zero value) when left unset.
  ElevenLabsLocation location = 5;
}

/*
 * Configuration for the ElevenLabs voice.
 *
 * Used as the optional `ElevenLabsTtsConfiguration.voice_settings`.
 *
 * None of the fields below has explicit presence: once this message is
 * provided, a field left unset reads as 0.0 / false.
 *
 * NOTE(review): valid value ranges are not stated in this file - presumably
 * they follow the ElevenLabs voice settings API; confirm.
 */
message ElevenLabsVoiceSettings {
  // Stability for the voice.
  double stability = 1;

  // Similarity boost for the voice.
  double similarity_boost = 2;

  // Style setting for v2 models.
  double style = 3;

  // Whether to apply speaker boost.
  bool use_speaker_boost = 4;

  // Speed setting for the voice.
  double speed = 5;
}

/*
 * Enumeration of ElevenLabs TTS service locations.
 *
 * Used by `ElevenLabsTtsConfiguration.location`.
 *
 * See here: https://elevenlabs.io/docs/overview/administration/data-residency
 */
enum ElevenLabsLocation {
  // United States
  //
  // This is the default location (it is the enum's zero value) and the one
  // that is accessed via https://elevenlabs.io/.
  US = 0;

  // European Union
  //
  // Requires enterprise access to ElevenLabs.
  EU = 1;

  // India
  //
  // Requires enterprise access to ElevenLabs.
  INDIA = 2;
}

/*
 * Configuration for hosted TTS provider (voice cloning) using a deepslate
 * provided voice.
 *
 * One of the two alternatives of the `HostedTtsConfiguration.voice` oneof.
 */
message HostedVoiceRef {
  // Identifier of the deepslate-provided voice to use.
  string voice_id = 1;
}

/*
 * Configuration for hosted TTS provider (voice cloning) using a custom voice.
 *
 * The voice is described by an audio sample plus its exact transcript. One of
 * the two alternatives of the `HostedTtsConfiguration.voice` oneof.
 */
message HostedVoiceCloneV1 {
  /*
   * Raw audio data for the voice sample.
   *
   * This should be between 20 and 25 seconds of speech. See our documentation
   * for more information.
   */
  bytes audio_data = 1;

  /*
   * Audio format for the voice sample, i.e. how `audio_data` is encoded.
   */
  AudioLineConfiguration audio_format = 2;

  /*
   * Exact transcript of the voice sample, including disfluencies, false
   * starts, etc.
   */
  string ref_text = 3;
}

/*
 * Quality/latency mode for hosted TTS.
 *
 * High quality prefers higher quality output over response speed; low
 * latency may degrade output but answers faster.
 *
 * Used by `HostedTtsConfiguration.mode`.
 */
enum HostedTtsMode {
  /*
   * Use a high quality generation mode which takes more time,
   * but produces high quality output.
   *
   * This is the recommended mode: latency is still relatively low, to the
   * point of being nearly imperceptible, while quality is significantly
   * better.
   *
   * As the zero value, this is also what an unset field reads as.
   */
  HIGH_QUALITY = 0;

  /*
   * Use a low latency generation mode.
   *
   * Output quality may be significantly degraded, but speech generation
   * takes next to no time to complete.
   */
  LOW_LATENCY = 1;
}

/*
 * Configuration for the hosted TTS provider.
 *
 * Selected via `TtsConfiguration.hosted`.
 */
message HostedTtsConfiguration {
  // Voice to synthesize with; at most one alternative can be set.
  oneof voice {
    // Use a deepslate-provided voice, referenced by ID.
    HostedVoiceRef voice_ref = 1;

    // Use a custom voice cloned from a supplied audio sample.
    //
    // Field numbers in this message are not in declaration order (1, 3, 2);
    // they are part of the wire format and must not be renumbered.
    HostedVoiceCloneV1 voice_clone_v1 = 3;
  }

  // Quality/latency trade-off for speech generation.
  //
  // Reads as `HIGH_QUALITY` (the enum's zero value) when left unset.
  HostedTtsMode mode = 2;
}

/*
 * Configuration for text-to-speech output.
 *
 * If not specified, raw text fragments are sent to the client.
 *
 * Used as the optional `InitializeSessionRequest.tts_configuration`.
 */
message TtsConfiguration {
  // TTS provider to use; at most one alternative can be set.
  oneof provider {
    // Synthesize speech through ElevenLabs.
    ElevenLabsTtsConfiguration eleven_labs = 1;

    // Synthesize speech through the hosted TTS provider.
    HostedTtsConfiguration hosted = 2;
  }
}

/*
 * First message sent to initialize a session.
 *
 * This message must always be the first message sent from the client to the
 * service. It is carried as `ServiceBoundMessage.initialize_session_request`.
 */
message InitializeSessionRequest {
  // Configuration for the input audio line.
  AudioLineConfiguration input_audio_line = 1;

  // Configuration for the output audio line.
  AudioLineConfiguration output_audio_line = 2;

  // VAD configuration for speech detection.
  VadConfiguration vad_configuration = 3;

  // Inference engine configuration.
  InferenceConfiguration inference_configuration = 4;

  // TTS configuration. If not specified, raw text fragments are sent to the
  // client.
  optional TtsConfiguration tts_configuration = 5;

  // Whether the client supports playback position reporting.
  //
  // When true, the client will send PlaybackPositionReport messages
  // with precise playback position data. When false (default), the
  // server uses elapsed-time estimation for context truncation.
  bool supports_playback_reporting = 6;

  // When true, the server emits per-frame VadAnalysisFrame messages
  // (~50 Hz). Off by default for cost/bandwidth reasons. State transitions
  // (VadStateEvent) are emitted unconditionally regardless of this flag.
  //
  // This flag is considered experimental and may be removed or changed
  // without a major version bump.
  bool enable_vad_frame_telemetry = 7;
}

/*
 * Request to reconfigure an ongoing session.
 *
 * This may be useful to change audio input settings on the fly.
 * Note that corvidae does not guarantee that this happens in a
 * seamless manner; there may be glitches or dropped audio.
 *
 * Sent by the client as `ServiceBoundMessage.reconfigure_session_request`.
 */
message ReconfigureSessionRequest {
  // New configuration for the input audio line.
  //
  // NOTE(review): presumably the current configuration is kept when this is
  // absent - confirm against the service implementation.
  optional AudioLineConfiguration input_audio_line = 1;

  // New inference engine configuration.
  //
  // NOTE(review): presumably the current configuration is kept when this is
  // absent - confirm against the service implementation.
  optional InferenceConfiguration inference_configuration = 2;
}

/*
 * Enumeration for inference trigger modes.
 *
 * Set per input packet via `UserInput.mode`.
 */
enum InferenceTriggerMode {
  // Don't trigger inference automatically.
  //
  // This is the zero value, i.e. what an unset `mode` reads as.
  NO_TRIGGER = 0;

  // Queue inference to happen as soon as current inference is
  // done (or immediately if no inference is ongoing).
  QUEUE = 1;

  // Interrupt current inference and start new inference immediately.
  IMMEDIATE = 2;
}

/*
 * Audio data wrapper.
 *
 * Carries raw audio bytes only; the format is defined by the
 * AudioLineConfiguration that applies in the context where it is used.
 */
message AudioData {
  // Raw audio data bytes. Must conform to the AudioLineConfiguration for the
  // respective line.
  bytes data = 1;
}

/*
 * Text data wrapper.
 *
 * Used as the text alternative of the `UserInput.input` oneof.
 */
message TextData {
  // Raw text data.
  string data = 1;
}

/*
 * A packet of user input, carrying either audio or text.
 *
 * Sent by the client as `ServiceBoundMessage.user_input`.
 */
message UserInput {
  // Packet identifier for tracking purposes.
  //
  // Corvidae doesn't care too much about this value, but it will use it to
  // refer to specific packets in responses or events. The client is free
  // to choose any numbering scheme, including non-sequential or random values.
  uint64 packet_id = 1;

  // Inference trigger mode for this input.
  //
  // Reads as `NO_TRIGGER` (the enum's zero value) when left unset.
  InferenceTriggerMode mode = 2;

  // The actual input payload; at most one alternative can be set.
  oneof input {
    // Audio data input.
    AudioData audio_data = 3;

    // Text data input.
    TextData text_data = 4;
  }
}

/*
 * Message to trigger inference processing manually and right now.
 *
 * This instructs the inference engine to take the current input and
 * process it immediately, instead of waiting for natural pauses or
 * end-of-input signals.
 *
 * Words of caution: This is useful for generating a greeting or similar
 * immediate response, but model behavior may be unpredictable if used
 * directly after a model response. Corvidae will do its best to patch
 * things together, but there are no guarantees. Generating a greeting is
 * the primary intended use case and is fully supported; other use cases
 * may or may not work as expected.
 *
 * Sent by the client as `ServiceBoundMessage.trigger_inference`.
 */
message TriggerInference {
  // Optional extra instructions to guide the inference.
  //
  // This can be used to provide context or specific directives
  // to the model for this inference trigger.
  optional string extra_instructions = 1;

  // When true, flush whatever the VAD pipeline currently has buffered and
  // append it as a user audio message before triggering inference.
  //
  // Use this to manually commit speech the VAD has not yet released — e.g.
  // when the trigger is driven by an external signal (push-to-talk button,
  // wake word) rather than waiting for natural end-of-speech detection.
  // When the buffer is empty or the deployment doesn't run a VAD pipeline,
  // this is a no-op.
  bool flush_vad = 2;
}

/*
 * Request to export the chat history.
 *
 * Sent by the client as `ServiceBoundMessage.export_chat_history_request`.
 * The exported history is represented by the `ChatHistory` message.
 */
message ExportChatHistoryRequest {
  // When true, wait for all in-flight async operations (e.g. transcriptions)
  // to complete before responding with the chat history.
  bool await_pending = 1;

  // Whether to exclude audio data from the exported chat history. Defaults
  // to false.
  //
  // Use this if you only need the transcripts and want to avoid transferring
  // large audio data blobs.
  bool exclude_audio = 2;
}

/*
 * Client-reported playback position.
 *
 * Sent by clients that support playback reporting to provide
 * precise information about how many audio bytes have been played.
 * This enables accurate context truncation on user interrupt.
 *
 * Only sent when the client declares `supports_playback_reporting = true`
 * in the InitializeSessionRequest.
 *
 * Carried as `ServiceBoundMessage.playback_position_report`.
 */
message PlaybackPositionReport {
  // Number of audio bytes played by the client.
  //
  // NOTE(review): this file does not state whether the count is cumulative
  // for the session or relative to the current response - confirm.
  uint64 bytes_played = 1;
}

/*
 * Direct speech directive.
 *
 * Instructs the service to speak the given text directly via TTS,
 * bypassing the LLM. Behaves like a user interruption: any active
 * inference is cancelled and buffered audio is cleared before the
 * new text is spoken.
 *
 * Sent by the client as `ServiceBoundMessage.direct_speech`.
 */
message DirectSpeech {
  // Text to speak.
  string text = 1;

  // Whether this spoken text should be included as an assistant message
  // in the chat history or not.
  //
  // When false (the default for an unset bool), the text is spoken, but the
  // LLM doesn't know that it was spoken; the message will be marked as
  // ephemeral in the chat history (see `ChatMessage.ephemeral`).
  bool include_in_history = 2;
}

/*
 * Conversation query request.
 *
 * Prompts the LLM with the conversation history and a custom instruction,
 * returning a complete text result. The query runs on a separate inference
 * context and does not modify the main conversation.
 *
 * At least one of `prompt` or `instructions` should be provided.
 *
 * Sent by the client as `ServiceBoundMessage.conversation_query`; the result
 * is represented by `ConversationQueryResult`.
 */
message ConversationQuery {
  // Replaces the system prompt for this one-shot inference call.
  // If absent, uses the session's current system prompt.
  optional string prompt = 1;

  // Appended as instructions after the conversation turns.
  // If absent, no extra instructions are appended.
  optional string instructions = 2;
}

/*
 * Result of a conversation query (see `ConversationQuery`).
 *
 * Delivered to the client as `ClientBoundMessage.conversation_query_result`.
 */
message ConversationQueryResult {
  // The LLM's complete response text.
  string text = 1;
}

/*
 * Message sent from the client to the service.
 *
 * Envelope type: each message carries exactly one of the payloads below.
 * The first message of a session must carry `initialize_session_request`.
 */
message ServiceBoundMessage {
  // The actual request or input being sent; see the individual message
  // types for their semantics.
  oneof payload {
    InitializeSessionRequest initialize_session_request = 1;
    ReconfigureSessionRequest reconfigure_session_request = 2;
    UserInput user_input = 3;
    UpdateToolDefinitionsRequest update_tool_definitions_request = 4;
    ToolCallResponse tool_call_response = 5;
    TriggerInference trigger_inference = 6;
    ExportChatHistoryRequest export_chat_history_request = 7;
    PlaybackPositionReport playback_position_report = 8;
    DirectSpeech direct_speech = 9;
    ConversationQuery conversation_query = 10;
  }
}

/*
 * A fragment of text from the model, streamed as tokens arrive.
 *
 * Delivered to the client as `ClientBoundMessage.model_text_fragment`.
 */
message ModelTextFragment {
  // The text content of this fragment.
  string text = 1;
}

/*
 * An audio chunk from the model (TTS output).
 *
 * Delivered to the client as `ClientBoundMessage.model_audio_chunk`.
 */
message ModelAudioChunk {
  // The audio data.
  AudioData audio = 1;

  // Optional: text that was spoken (alignment data from TTS provider).
  optional string transcript = 2;
}

/*
 * Exported chat history.
 *
 * Delivered to the client as `ClientBoundMessage.chat_history`.
 */
message ChatHistory {
  // The messages that make up the exported history.
  repeated ChatMessage messages = 1;
}

/*
 * Role of a chat message, i.e. the entity a `ChatMessage` is attributed to.
 */
enum ChatMessageRole {
  /*
   * System message (usually the system prompt).
   *
   * This is the zero value, i.e. what an unset `role` reads as.
   */
  SYSTEM = 0;

  /*
   * User message.
   */
  USER = 1;

  /*
   * Assistant message.
   */
  ASSISTANT = 2;
}

/*
 * Delivery status of a chat message (see `ChatMessage.delivery_status`).
 */
enum ChatDeliveryStatus {
  // Turn is still being generated.
  //
  // This is the zero value, i.e. what an unset status reads as.
  DELIVERY_IN_PROGRESS = 0;

  // All content was delivered to the client.
  DELIVERY_COMPLETE = 1;

  // User interrupted. Content reflects what was actually delivered.
  DELIVERY_INTERRUPTED = 2;
}

/*
 * Message in the chat history.
 *
 * One entry of `ChatHistory.messages`.
 */
message ChatMessage {
  /*
   * The role of the entity that this message is attributed to.
   */
  ChatMessageRole role = 1;

  /*
   * Ordered content of this message.
   *
   * For assistant messages with TTS, text entries include the synthesized
   * audio. Content blocks are ordered as produced — tool calls are
   * interleaved with text at the positions they were emitted by the model.
   */
  repeated ChatMessageContent content = 2;

  /*
   * Delivery status of this message.
   */
  ChatDeliveryStatus delivery_status = 3;

  /*
   * Whether this message is ephemeral.
   *
   * Ephemeral messages were spoken to the user but are
   * not part of the LLM's context.
   */
  bool ephemeral = 4;

  /*
   * When this message was started (turn creation time).
   */
  google.protobuf.Timestamp created_at = 5;

  /*
   * The turn ID. Correlates with `ResponseBegin.turn_id` /
   * `ResponseEnd.turn_id` for assistant turns and
   * `UserTranscriptionResult.turn_id` for user audio turns.
   */
  optional uint32 turn_id = 6;

  /*
   * If this turn was truncated from the LLM's context window, the turn_id of
   * the assistant response that was first generated without this turn in
   * context.
   *
   * Absent means the turn is still in context (or was never truncated).
   * When present, you can look up the referenced response turn's `created_at`
   * to determine the wall-clock time of truncation.
   */
  optional uint32 truncated_at_response_turn_id = 7;
}

/*
 * A single content block within a chat message.
 *
 * One entry of `ChatMessage.content`; each block holds exactly one kind of
 * content.
 */
message ChatMessageContent {
  // The kind of content this block carries.
  oneof content {
    // Text content, optionally with TTS-synthesized audio.
    ChatTextContent text_content = 1;

    // User input or model-output audio (not TTS-synthesized).
    ChatAudioData input_audio = 2;

    // Internal model reasoning / chain-of-thought.
    string thoughts = 3;

    // Tool call requested by the model.
    ToolCallRequest tool_call = 4;

    // Tool execution result.
    ToolCallResponse tool_result = 5;

    // Model instructions (e.g. directives injected via trigger_inference).
    string instructions = 6;
  }
}

/*
 * Text content with optional TTS-synthesized audio.
 *
 * When TTS is active, each synthesized sentence becomes a ChatTextContent
 * entry with both text and tts_audio populated. Text without TTS audio
 * (e.g., when TTS is not configured) has tts_audio absent.
 *
 * Used as `ChatMessageContent.text_content`.
 */
message ChatTextContent {
  // The text content.
  string text = 1;

  // TTS-synthesized audio for this text, if available.
  optional ChatAudioData tts_audio = 2;
}

/*
 * Self-describing audio data (audio bytes + format metadata).
 *
 * Composes existing AudioData and AudioLineConfiguration types so
 * consumers can decode the audio without out-of-band format knowledge.
 *
 * Please be careful when using this data and actually look at the format
 * if you reconfigure the audio pipeline, as it is then possible for the
 * format to change mid-conversation.
 */
message ChatAudioData {
  // Raw audio bytes.
  AudioData audio = 1;

  // Audio format (sample rate, channels, sample format) of `audio`.
  AudioLineConfiguration format = 2;

  // Transcription of the audio content (when available).
  // Populated asynchronously for user audio turns.
  //
  // This is a plain string without explicit presence, so it reads as empty
  // while no transcription is set.
  string transcription = 3;
}

/*
 * Notification to clear the playback buffer.
 *
 * This is sent when the user starts speaking, regardless of whether
 * there is ongoing TTS playback or not. In other words, the event is
 * sent pro-actively to ensure that any ongoing playback is stopped.
 *
 * This might be sent multiple times if the user interrupts multiple times.
 *
 * This is soft deprecated in favor of the more informative VadStateEvent
 * stream, but is still sent for backward compatibility and ease of use for
 * simple clients.
 *
 * Delivered to the client as `ClientBoundMessage.playback_clear_buffer`.
 * The message carries no fields; its arrival is the signal.
 */
message PlaybackClearBuffer {}

/*
 * Notification that the model has begun its response.
 *
 * Delivered to the client as `ClientBoundMessage.response_begin`.
 */
message ResponseBegin {
  /*
   * Session-local ID of the assistant turn this response begins. Matches the
   * `ChatMessage.turn_id` for the assistant turn that gets exported with the
   * eventual chat history. The same value is sent on the matching
   * `ResponseEnd`.
   */
  uint32 turn_id = 1;
}

/*
 * Notification that the model has finished its response.
 *
 * Delivered to the client as `ClientBoundMessage.response_end`.
 */
message ResponseEnd {
  /*
   * Session-local ID of the assistant turn this response ends. Matches the
   * `turn_id` from the preceding `ResponseBegin`.
   */
  uint32 turn_id = 1;
}

/*
 * Error category for client-facing error notifications.
 *
 * These are broad categories that help clients understand the nature of the
 * error. Reported in `SessionErrorNotification.category`.
 */
enum SessionErrorCategory {
  // Unknown or unclassified error.
  //
  // This is the zero value, i.e. what an unset category reads as.
  ERROR_UNKNOWN = 0;

  // Session lifecycle errors (not initialized, already initialized).
  ERROR_SESSION = 1;

  // Configuration errors (invalid audio format, missing required fields).
  ERROR_CONFIGURATION = 2;

  // Protocol errors (malformed packets, unexpected message types).
  ERROR_PROTOCOL = 3;

  // Inference/AI processing errors (model unavailable, processing failed,
  // timeout).
  ERROR_INFERENCE = 4;

  // Audio pipeline errors (codec failure, VAD errors).
  ERROR_AUDIO = 5;

  // TTS synthesis errors.
  ERROR_TTS = 6;

  // Internal service errors (catch-all for server-side issues).
  ERROR_INTERNAL = 7;
}

/*
 * Error notification sent to the client.
 *
 * This message is sent before the connection is closed to provide useful
 * context for debugging without exposing server internals.
 *
 * Delivered to the client as `ClientBoundMessage.error`.
 */
message SessionErrorNotification {
  // The error category for programmatic handling.
  SessionErrorCategory category = 1;

  // Human-readable message suitable for logging or display.
  string message = 2;

  // Optional trace ID for correlating with server logs.
  // Clients can provide this when reporting issues for easier debugging.
  optional string trace_id = 3;
}

/*
 * Notification that a user audio turn has been transcribed.
 *
 * Sent asynchronously after the transcription worker processes a user
 * audio turn. The turn_id identifies which conversation turn this
 * transcription belongs to.
 *
 * Delivered to the client as `ClientBoundMessage.user_transcription_result`.
 */
message UserTranscriptionResult {
  // Turn ID this transcription belongs to (correlates with
  // `ChatMessage.turn_id` of the user audio turn).
  uint32 turn_id = 1;

  // Transcribed text.
  string text = 2;

  // Detected language (ISO 639-1 code, e.g. "en").
  string language = 3;
}

/*
 * Notification sent to the client when the session is fully initialized
 * and ready to accept user input. Sent after TTS warmup completes.
 *
 * Delivered to the client as `ClientBoundMessage.session_ready`. The message
 * carries no fields; its arrival is the signal.
 */
message SessionReady {}

/*
 * Notification that messages were truncated from the LLM's context window.
 *
 * Sent when the inference engine removes older messages to fit the token
 * budget. Only includes turns that were **newly** truncated in this inference
 * cycle - turns already reported in a previous ContextTruncated event are
 * not repeated.
 *
 * Delivered to the client as `ClientBoundMessage.context_truncated`.
 */
message ContextTruncated {
  /*
   * Turn IDs that were newly removed from the model's context window.
   * These correspond to `ChatMessage.turn_id` values in the conversation.
   * The model can no longer "see" these messages.
   */
  repeated uint32 truncated_turn_ids = 1;

  /*
   * The turn_id of the assistant response that was generated with the
   * truncated context. Correlates this truncation event with the
   * `ResponseBegin` / `ResponseEnd` for the same inference cycle.
   */
  uint32 response_turn_id = 2;
}

/*
 * Message sent from the service to the client.
 *
 * Envelope type: each message carries exactly one of the payloads below.
 */
message ClientBoundMessage {
  // The actual notification or result being delivered; see the individual
  // message types for their semantics.
  oneof payload {
    ToolCallRequest tool_call_request = 1;
    ModelTextFragment model_text_fragment = 2;
    ModelAudioChunk model_audio_chunk = 3;
    PlaybackClearBuffer playback_clear_buffer = 4;
    ResponseBegin response_begin = 5;
    ResponseEnd response_end = 6;
    ChatHistory chat_history = 7;
    SessionErrorNotification error = 8;
    UserTranscriptionResult user_transcription_result = 9;
    ConversationQueryResult conversation_query_result = 10;
    SessionReady session_ready = 11;
    VadAnalysisFrame vad_analysis_frame = 12;
    VadStateEvent vad_state_event = 13;
    ContextTruncated context_truncated = 14;
  }
}