syntax = "proto3"; import "google/protobuf/struct.proto"; package eu.deepslate.realtime.speeq; /* * Enumeration of supported audio sample formats. */ enum SampleFormat { // 8-bit unsigned integer samples. UNSIGNED_8_BIT = 0; // 16-bit signed integer samples. SIGNED_16_BIT = 1; // 32-bit signed integer samples. SIGNED_32_BIT = 2; // 32-bit floating point samples (0.0 to 1.0). FLOAT_32_BIT = 3; // 64-bit floating point samples (0.0 to 1.0). FLOAT_64_BIT = 4; } /* * Definition of a tool that can be used by the model during inference. */ message ToolDefinition { // Name of the tool. // // This is what the model will use to refer to the tool when requesting its use. string name = 1; // Description of the tool's purpose and functionality. // // When not present, this will either be omitted or an empty string // depending on the inference engine's capabilities. optional string description = 2; // Parameters schema for the tool. // // This needs to be a valid JSON Schema object. Most inference engines will requires // the top level value to be of type "object". google.protobuf.Struct parameters = 3; } /* * Request message to update the tool definitions. * * This always completely replaces the existing tool definitions * with the provided ones, as in, it does not merge with existing definitions. * * An empty list of tool definitions will clear all existing tool definitions. */ message UpdateToolDefinitionsRequest { // List of tool definitions to use from now on. repeated ToolDefinition tool_definitions = 1; } /* * Request message to call a tool. * * Depending on the inference engine, you may get multiple requests to call tools * before getting a final response. * * Every request **must** be responded to with a corresponding ToolCallResponse, even * if tool execution fails or is not possible for some reason. */ message ToolCallRequest { // Identifier of the tool to call. // // This does **not** identify the tool, but the request to call a specific tool. string id = 1; // Name of the tool to call. string name = 2; // Parameters to pass to the tool. google.protobuf.Struct parameters = 3; } /* * Response message for a tool call. */ message ToolCallResponse { // Identifier of the tool call request this response corresponds to. string id = 1; // Result of the tool call. // // This _can_ be JSON, but it doesn't have to be. As long as the model is able to // understand the result, any format is acceptable. string result = 2; } /* * Duration message representing a span of time. */ message Duration { // Whole seconds of the duration. uint64 seconds = 1; // Nanoseconds component of the duration. uint32 nanos = 2; } /* * Configuration for an audio line (input or output depending on context). * * The service always assumes that audio is raw PCM with 16-bit samples. */ message AudioLineConfiguration { // Sample rate in Hz. uint32 sample_rate = 1; // Number of audio channels. uint32 channel_count = 2; // Sample format. SampleFormat sample_format = 3; } message VadConfiguration { // Minimum confidence required to consider audio as speech (0.0 to 1.0). float confidence_threshold = 1; // Minimum volume level to consider audio as speech (0.0 to 1.0). float min_volume = 2; // Duration of speech to detect start of speech. Duration start_duration = 3; // Duration of silence to detect end of speech. Duration stop_duration = 4; // Duration of audio to buffer before speech detection. // This allows capturing audio from before speech starts. // // Recommended duration is 1 second. 
/*
 * Duration message representing a span of time.
 */
message Duration {
  // Whole seconds of the duration.
  uint64 seconds = 1;

  // Nanoseconds component of the duration.
  uint32 nanos = 2;
}

/*
 * Configuration for an audio line (input or output depending on context).
 *
 * The service always assumes that audio is raw PCM; the exact sample
 * layout is described by the fields below.
 */
message AudioLineConfiguration {
  // Sample rate in Hz.
  uint32 sample_rate = 1;

  // Number of audio channels.
  uint32 channel_count = 2;

  // Sample format.
  SampleFormat sample_format = 3;
}

/*
 * Configuration for voice activity detection (VAD).
 */
message VadConfiguration {
  // Minimum confidence required to consider audio as speech (0.0 to 1.0).
  float confidence_threshold = 1;

  // Minimum volume level to consider audio as speech (0.0 to 1.0).
  float min_volume = 2;

  // Duration of speech required to detect the start of speech.
  Duration start_duration = 3;

  // Duration of silence required to detect the end of speech.
  Duration stop_duration = 4;

  // Duration of audio to buffer before speech detection.
  // This allows capturing audio from before speech starts.
  //
  // The recommended duration is 1 second.
  Duration backbuffer_duration = 5;
}

/*
 * Configuration for the inference engine.
 */
message InferenceConfiguration {
  // System prompt to guide the model's behavior.
  string system_prompt = 1;
}

/*
 * Configuration for the ElevenLabs TTS provider.
 */
message ElevenLabsTtsConfiguration {
  // ElevenLabs API key for this session.
  string api_key = 1;

  // Voice ID to use (e.g., "21m00Tcm4TlvDq8ikWAM" for Rachel).
  string voice_id = 2;

  // Model ID (e.g., "eleven_turbo_v2"). If not set, the default is used.
  optional string model_id = 3;
}

/*
 * Configuration for text-to-speech output.
 *
 * If not specified, raw text fragments are sent to the client.
 */
message TtsConfiguration {
  oneof provider {
    ElevenLabsTtsConfiguration eleven_labs = 1;
  }
}

/*
 * First message sent to initialize a session.
 *
 * This message must always be the first message sent from the client to the service.
 */
message InitializeSessionRequest {
  // Configuration for the input audio line.
  AudioLineConfiguration input_audio_line = 1;

  // Configuration for the output audio line.
  AudioLineConfiguration output_audio_line = 2;

  // VAD configuration for speech detection.
  VadConfiguration vad_configuration = 3;

  // Inference engine configuration.
  InferenceConfiguration inference_configuration = 4;

  // TTS configuration. If not specified, raw text fragments are sent to the client.
  optional TtsConfiguration tts_configuration = 5;
}

/*
 * Request to reconfigure an ongoing session.
 *
 * This may be useful to change audio input settings on the fly.
 * Note that corvidae does not guarantee that this happens in a
 * seamless manner; there may be glitches or dropped audio.
 */
message ReconfigureSessionRequest {
  optional AudioLineConfiguration input_audio_line = 1;
}

/*
 * Enumeration of inference trigger modes.
 */
enum InferenceTriggerMode {
  // Don't trigger inference automatically.
  NO_TRIGGER = 0;
  // Queue inference to happen as soon as the current inference is
  // done (or immediately if no inference is ongoing).
  QUEUE = 1;
  // Interrupt the current inference and start a new inference immediately.
  IMMEDIATE = 2;
}

/*
 * Audio data wrapper.
 */
message AudioData {
  // Raw audio data bytes. Must conform to the AudioLineConfiguration
  // for the respective line.
  bytes data = 1;
}

/*
 * Text data wrapper.
 */
message TextData {
  // Raw text data.
  string data = 1;
}

/*
 * A single unit of user input, either audio or text.
 */
message UserInput {
  // Packet identifier for tracking purposes.
  //
  // Corvidae doesn't care too much about this value, but it will use it to
  // refer to specific packets in responses or events. The client is free
  // to choose any numbering scheme, including non-sequential or random values.
  uint64 packet_id = 1;

  // Inference trigger mode for this input.
  InferenceTriggerMode mode = 2;

  oneof input {
    // Audio data input.
    AudioData audio_data = 3;
    // Text data input.
    TextData text_data = 4;
  }
}
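/*
 * Illustrative example (all values are made up): a UserInput carrying one
 * chunk of audio for an input line configured as 16000 Hz, 1 channel,
 * SIGNED_16_BIT. At 2 bytes per sample, that line produces
 * 16000 * 1 * 2 = 32000 bytes of PCM per second, so a 20 ms chunk is
 * 640 bytes.
 *
 *   packet_id: 42
 *   mode: NO_TRIGGER
 *   audio_data { data: "<640 bytes of raw PCM>" }
 */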
/*
 * Message to trigger inference processing manually and right now.
 *
 * This instructs the inference engine to take the current input and
 * process it immediately, instead of waiting for natural pauses or
 * end-of-input signals.
 *
 * Words of caution: This is useful for generating a greeting or similar
 * immediate response, but model behavior may be unpredictable if used
 * directly after a model response. Corvidae will do its best to patch
 * things together, but there are no guarantees. Generating a greeting is
 * the primary intended use case and is fully supported; other use cases
 * may or may not work as expected.
 */
message TriggerInference {
  // Optional extra instructions to guide the inference.
  //
  // This can be used to provide context or specific directives
  // to the model for this inference trigger.
  optional string extra_instructions = 1;
}

/*
 * Message sent from the client to the service.
 */
message ServiceBoundMessage {
  oneof payload {
    InitializeSessionRequest initialize_session_request = 1;
    ReconfigureSessionRequest reconfigure_session_request = 2;
    UserInput user_input = 3;
    UpdateToolDefinitionsRequest update_tool_definitions_request = 4;
    ToolCallResponse tool_call_response = 5;
    TriggerInference trigger_inference = 6;
  }
}

/*
 * A fragment of text from the model, streamed as tokens arrive.
 */
message ModelTextFragment {
  // The text content of this fragment.
  string text = 1;
}

/*
 * An audio chunk from the model (TTS output).
 */
message ModelAudioChunk {
  // The audio data.
  AudioData audio = 1;

  // Optional: text that was spoken (alignment data from the TTS provider).
  optional string transcript = 2;
}

/*
 * Notification to clear the playback buffer.
 *
 * This is sent when the user starts speaking, regardless of whether
 * there is ongoing TTS playback or not. In other words, the event is
 * sent proactively to ensure that any ongoing playback is stopped.
 *
 * This might be sent multiple times if the user interrupts multiple times.
 */
message PlaybackClearBuffer {}

/*
 * Notification that the model has begun its response.
 */
message ResponseBegin {}

/*
 * Notification that the model has finished its response.
 */
message ResponseEnd {}

/*
 * Message sent from the service to the client.
 */
message ClientBoundMessage {
  oneof payload {
    ToolCallRequest tool_call_request = 1;
    ModelTextFragment model_text_fragment = 2;
    ModelAudioChunk model_audio_chunk = 3;
    PlaybackClearBuffer playback_clear_buffer = 4;
    ResponseBegin response_begin = 5;
    ResponseEnd response_end = 6;
  }
}
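/*
 * Sketch of a typical session flow, pieced together from the message
 * comments above. The exact ordering of events within a response is up
 * to the service; this is only an illustration.
 *
 *   client  -> service: InitializeSessionRequest   (always the first message)
 *   client  -> service: UserInput ...              (audio or text, streamed)
 *   service -> client:  PlaybackClearBuffer        (user started speaking)
 *   service -> client:  ResponseBegin
 *   service -> client:  ToolCallRequest            (zero or more; each must be
 *   client  -> service: ToolCallResponse            answered, even on failure)
 *   service -> client:  ModelTextFragment / ModelAudioChunk ...
 *   service -> client:  ResponseEnd
 */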