syntax = "proto3";

import "google/protobuf/struct.proto";

package eu.deepslate.realtime.speeq;

/*
 * Enumeration of supported audio sample formats.
 */
enum SampleFormat {
  // 8-bit unsigned integer samples.
  UNSIGNED_8_BIT = 0;

  // 16-bit signed integer samples.
  SIGNED_16_BIT = 1;

  // 32-bit signed integer samples.
  SIGNED_32_BIT = 2;

  // 32-bit floating point samples (0.0 to 1.0).
  FLOAT_32_BIT = 3;

  // 64-bit floating point samples (0.0 to 1.0).
  FLOAT_64_BIT = 4;
}

/*
 * Definition of a tool that can be used by the model during inference.
 */
message ToolDefinition {
  // Name of the tool.
  //
  // This is what the model will use to refer to the tool when requesting its use.
  string name = 1;

  // Description of the tool's purpose and functionality.
  //
  // When not present, this will either be omitted or an empty string
  // depending on the inference engine's capabilities.
  optional string description = 2;

  // Parameters schema for the tool.
  //
  // This needs to be a valid JSON Schema object. Most inference engines will requires
  // the top level value to be of type "object".
  google.protobuf.Struct parameters = 3;
}

/*
 * Request message to update the tool definitions.
 *
 * This always completely replaces the existing tool definitions
 * with the provided ones, as in, it does not merge with existing definitions.
 *
 * An empty list of tool definitions will clear all existing tool definitions.
 */
message UpdateToolDefinitionsRequest {
  // List of tool definitions to use from now on.
  repeated ToolDefinition tool_definitions = 1;
}

/*
 * Request message to call a tool.
 *
 * Depending on the inference engine, you may get multiple requests to call tools
 * before getting a final response.
 *
 * Every request **must** be responded to with a corresponding ToolCallResponse, even
 * if tool execution fails or is not possible for some reason.
 */
message ToolCallRequest {
  // Identifier of the tool to call.
  //
  // This does **not** identify the tool, but the request to call a specific tool.
  string id = 1;

  // Name of the tool to call.
  string name = 2;

  // Parameters to pass to the tool.
  google.protobuf.Struct parameters = 3;
}

/*
 * Response message for a tool call.
 */
message ToolCallResponse {
  // Identifier of the tool call request this response corresponds to.
  string id = 1;

  // Result of the tool call.
  //
  // This _can_ be JSON, but it doesn't have to be. As long as the model is able to
  // understand the result, any format is acceptable.
  string result = 2;
}

/*
 * Duration message representing a span of time.
 */
message Duration {
  // Whole seconds of the duration.
  uint64 seconds = 1;

  // Nanoseconds component of the duration.
  uint32 nanos = 2;
}

/*
 * Configuration for an audio line (input or output depending on context).
 *
 * The service always assumes that audio is raw PCM with 16-bit samples.
 */
message AudioLineConfiguration {
  // Sample rate in Hz.
  uint32 sample_rate = 1;

  // Number of audio channels.
  uint32 channel_count = 2;

  // Sample format.
  SampleFormat sample_format = 3;
}

message VadConfiguration {
  // Minimum confidence required to consider audio as speech (0.0 to 1.0).
  float confidence_threshold = 1;

  // Minimum volume level to consider audio as speech (0.0 to 1.0).
  float min_volume = 2;

  // Duration of speech to detect start of speech.
  Duration start_duration = 3;

  // Duration of silence to detect end of speech.
  Duration stop_duration = 4;

  // Duration of audio to buffer before speech detection.
  // This allows capturing audio from before speech starts.
  //
  // Recommended duration is 1 second.
  Duration backbuffer_duration = 5;
}

/*
 * Configuration for inference engine.
 */
message InferenceConfiguration {
  // System prompt to guide the model's behavior.
  string system_prompt = 1;
}

/*
 * Configuration for ElevenLabs TTS provider.
 */
message ElevenLabsTtsConfiguration {
  // ElevenLabs API key for this session
  string api_key = 1;

  // Voice ID to use (e.g., "21m00Tcm4TlvDq8ikWAM" for Rachel)
  string voice_id = 2;

  // Model ID (e.g., "eleven_turbo_v2"). If not set, uses default.
  optional string model_id = 3;
}

/*
 * Configuration for text-to-speech output.
 *
 * If not specified, raw text fragments are sent to the client.
 */
message TtsConfiguration {
  oneof provider {
    ElevenLabsTtsConfiguration eleven_labs = 1;
  }
}

/*
 * First message sent to initialize a session.
 *
 * This message must always be the first message sent from the client to the service
 */
message InitializeSessionRequest {
  // Configuration for the input audio line.
  AudioLineConfiguration input_audio_line = 1;

  // Configuration for the output audio line.
  AudioLineConfiguration output_audio_line = 2;

  // VAD configuration for speech detection.
  VadConfiguration vad_configuration = 3;

  // Inference engine configuration.
  InferenceConfiguration inference_configuration = 4;

  // TTS configuration. If not specified, raw text fragments are sent to the client.
  optional TtsConfiguration tts_configuration = 5;
}

/*
 * Request to reconfigure an ongoing session.
 *
 * This may be useful to change audio input settings on the fly.
 * Note that corvidae does not guarantee that this happens in a
 * seamless manner, there may be glitches or dropped audio.
 */
message ReconfigureSessionRequest {
  optional AudioLineConfiguration input_audio_line = 1;
}

/*
 * Enumeration for inference trigger modes.
 */
enum InferenceTriggerMode {
  // Don't trigger inference automatically.
  NO_TRIGGER = 0;

  // Queue inference to happen as soon as current inference is
  // done (or immediately if no inference is ongoing).
  QUEUE = 1;

  // Interrupt current inference and start new inference immediately.
  IMMEDIATE = 2;
}

/*
 * Audio data wrapper.
 */
message AudioData {
  // Raw audio data bytes. Must confirm to the AudioLineConfiguration for the respective line.
  bytes data = 1;
}

/*
 * Text data wrapper.
 */
message TextData {
  // Raw text data.
  string data = 1;
}

message UserInput {
  // Packet identifier for tracking purposes.
  //
  // Corvidae doesn't care too much about this value, but it will use it to
  // refer to specific packets in responses or events. The client is free
  // to choose any numbering scheme, including non-sequential or random values.
  uint64 packet_id = 1;

  // Inference trigger mode for this input.
  InferenceTriggerMode mode = 2;

  oneof input {
    // Audio data input.
    AudioData audio_data = 3;

    // Text data input.
    TextData text_data = 4;
  }
}

/*
 * Message to trigger inference processing manually and right now.
 *
 * This instructs the inference engine to take the current input and
 * process it immediately, instead of waiting for natural pauses or
 * end-of-input signals.
 *
 * Words of caution: This is useful for generating a greeting or similar
 * immediate response, but model behavior may be unpredictable if used
 * directly after a model response. Corvidae will do its best to patch
 * things together, but there are no guarantees. Use for generating a
 * greeting is the primary intended use case and fully supported, other
 * use cases may or may not work as expected.
 */
message TriggerInference {
  // Optional extra instructions to guide the inference.
  //
  // This can be used to provide context or specific directives
  // to the model for this inference trigger.
  optional string extra_instructions = 1;
}

/*
 * Message sent from the client to the service.
 */
message ServiceBoundMessage {
  oneof payload {
    InitializeSessionRequest initialize_session_request = 1;
    ReconfigureSessionRequest reconfigure_session_request = 2;
    UserInput user_input = 3;
    UpdateToolDefinitionsRequest update_tool_definitions_request = 4;
    ToolCallResponse tool_call_response = 5;
    TriggerInference trigger_inference = 6;
  }
}

/*
 * A fragment of text from the model, streamed as tokens arrive.
 */
message ModelTextFragment {
  // The text content of this fragment.
  string text = 1;
}

/*
 * An audio chunk from the model (TTS output).
 */
message ModelAudioChunk {
  // The audio data
  AudioData audio = 1;

  // Optional: text that was spoken (alignment data from TTS provider)
  optional string transcript = 2;
}

/*
 * Notification to clear the playback buffer.
 *
 * This is sent when the user starts speaking, regardless of whether
 * there is ongoing TTS playback or not. In other words, the event is
 * sent pro-actively to ensure that any ongoing playback is stopped.
 *
 * This might be sent multiple times if the user interrupts multiple times.
 */
message PlaybackClearBuffer {}

/*
 * Notification that the model has begun its response.
 */
message ResponseBegin {}

/*
 * Notification that the model has finished its response.
 */
message ResponseEnd {}

message ClientBoundMessage {
  oneof payload {
    ToolCallRequest tool_call_request = 1;
    ModelTextFragment model_text_fragment = 2;
    ModelAudioChunk model_audio_chunk = 3;
    PlaybackClearBuffer playback_clear_buffer = 4;
    ResponseBegin response_begin = 5;
    ResponseEnd response_end = 6;
  }
}