{ "openapi": "3.1.0", "info": { "title": "Speech-AI Forge API", "description": "\n🍦 Speech-AI-Forge 是一个围绕 TTS 生成模型 ChatTTS 开发的项目,实现了 API Server 和 基于 Gradio 的 WebUI。
\n🍦 Speech-AI-Forge is a project developed around the TTS generation model ChatTTS, implementing an API Server and a Gradio-based WebUI.\n\n项目地址: [https://github.com/lenML/Speech-AI-Forge](https://github.com/lenML/Speech-AI-Forge)\n\n> 所有生成音频的 POST api都无法在此页面调试,调试建议使用 playground
\n> All audio generation POST APIs cannot be debugged on this page, it is recommended to use playground for debugging\n\n> 如果你不熟悉本系统,建议从这个一键脚本开始,在colab中尝试一下:
\n> [https://colab.research.google.com/github/lenML/Speech-AI-Forge/blob/main/colab.ipynb](https://colab.research.google.com/github/lenML/Speech-AI-Forge/blob/main/colab.ipynb)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)\n ", "version": "0.1.0" }, "paths": { "/v1/ping": { "get": { "tags": [ "System" ], "summary": "Ping", "description": "Health check", "operationId": "ping_v1_ping_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/versions": { "get": { "tags": [ "System" ], "summary": "Get Versions", "description": "Get versions", "operationId": "get_versions_v1_versions_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/audio_formats": { "get": { "tags": [ "System" ], "summary": "Get Audio Formats", "description": "Get audio encoder formats", "operationId": "get_audio_formats_v1_audio_formats_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/models/reload": { "get": { "tags": [ "Models" ], "summary": "Reload Models", "operationId": "reload_models_v1_models_reload_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/models/unload": { "get": { "tags": [ "Models" ], "summary": "Unload Models", "operationId": "unload_models_v1_models_unload_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/models/list": { "get": { "tags": [ "Models" ], "summary": "List Models", "operationId": 
"unload_models_v1_models_list_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/styles/list": { "get": { "tags": [ "Style" ], "summary": "List Styles", "description": "**DEPRECATED**\nThis API is deprecated and will be removed in the future. We will replace it with the speaker system and prompt system.", "operationId": "list_styles_v1_styles_list_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/speakers/list": { "get": { "tags": [ "Speaker" ], "summary": "List Speakers", "description": "List all available speakers with optional pagination and detail control.\n\n- `detailed`: If true, returns complete metadata including references and embeddings.\n- `offset` / `limit`: Support for paginated speaker listing.", "operationId": "list_speakers_v1_speakers_list_get", "parameters": [ { "name": "detailed", "in": "query", "required": false, "schema": { "type": "boolean", "default": false, "title": "Detailed" } }, { "name": "offset", "in": "query", "required": false, "schema": { "type": "integer", "default": 0, "title": "Offset" } }, { "name": "limit", "in": "query", "required": false, "schema": { "type": "integer", "default": 5, "title": "Limit" } } ], "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/speakers/refresh": { "post": { "tags": [ "Speaker" ], "summary": "Refresh Speakers", "description": "Force reload of all speaker metadata from disk. 
\nUse this when files are modified externally or newly added.", "operationId": "refresh_speakers_v1_speakers_refresh_post", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } } } } }, "/v1/speakers/update": { "post": { "tags": [ "Speaker" ], "summary": "Update Speakers", "description": "Batch update multiple speakers by providing a list of speaker JSON configs.\n\nEach speaker must already exist (matched by ID). \nWill overwrite corresponding fields and persist changes to disk.", "operationId": "update_speakers_v1_speakers_update_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/SpeakersUpdate" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/speaker/create": { "post": { "tags": [ "Speaker" ], "summary": "Create Speaker", "description": "Create a new speaker profile with optional reference audios.\n\n- `name` is required and used as unique identifier.\n- `wavs` is a list of audio samples (base64-encoded) and reference texts for embedding.\n- `save_file`: If true, the speaker will be saved to disk and available after refresh.", "operationId": "create_speaker_v1_speaker_create_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CreateSpeaker" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" 
} } } } } } }, "/v1/speaker/update": { "post": { "tags": [ "Speaker" ], "summary": "Update Speaker", "description": "Update a single speaker's configuration by full JSON override.\n\nThe speaker must already exist (matched by ID). \nFields like name, gender, refs, etc., will be updated accordingly.", "operationId": "update_speaker_v1_speaker_update_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/UpdateSpeaker" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/speaker/detail": { "post": { "tags": [ "Speaker" ], "summary": "Speaker Detail", "description": "Fetch metadata of a specific speaker by ID.\n\n- `with_emb`: If true, includes embedding vectors and all reference data.", "operationId": "speaker_detail_v1_speaker_detail_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/SpeakerDetail" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/tts": { "get": { "tags": [ "TTS" ], "summary": "Synthesize Tts", "description": "**Text-to-Speech Synthesis API (v1 - GET)**\n\nThis endpoint converts text into speech using GET request parameters.\nIt offers various options for customizing the voice, style, audio output, and processing.\n\n**Mandatory Parameter**:\n* `text`: The text string to be synthesized.\n\n**Speaker and Style Customization**:\n* `spk`: Specify the speaker by name or 
seed (e.g., \"female2\").\n* `style`: Define the speaking style (e.g., \"chat\").\n\n**Generation Control**:\n* `temperature`: Sampling temperature (0.0-1.0, e.g., 0.3). Controls randomness.\n* `top_p`: Nucleus sampling probability (0.0-1.0, e.g., 0.5).\n* `top_k`: Limits sampling to the K most likely next tokens (e.g., 20).\n* `seed`: Seed for reproducible generation (e.g., 42).\n* `prompt`, `prompt1`, `prompt2`: Optional text prompts to guide inference.\n* `prefix`: Optional text prefix for inference.\n* `bs` (batch_size): Batch size for processing (e.g., \"8\").\n* `thr` (threshold): Sentence splitter threshold (e.g., \"100\").\n* `eos`: End-of-sentence marker (e.g., \"[uv_break]\").\n\n**Output Audio Configuration**:\n* `format`: Desired audio output format. Supported: \"mp3\", \"wav\", \"raw\" (default).\n* `bitrate`: Audio bitrate for compressed formats (e.g., \"64k\").\n\n**Audio Adjustments**:\n* `speed`: Playback speed multiplier (e.g., 1.0 for normal).\n* `pitch`: Pitch adjustment (e.g., 0 for no change).\n* `volume_gain`: Volume gain in dB (e.g., 0 for no change).\n\n**Enhancements**:\n* `enhance`: Boolean to enable audio enhancement (default: false).\n* `denoise`: Boolean to enable audio denoising (default: false).\n\n**Streaming Output**:\n* `stream`: Boolean to enable streaming audio generation (default: false).\n* `chunk_size`: Size of chunks for streaming (e.g., 64, if stream is true).\n\n**Caching**:\n* `no_cache`: Boolean or \"on\"/\"off\" to disable caching (default: false).\n\n**Model Selection**:\n* `model`: Specify the TTS model ID to use (e.g., \"chat-tts\", \"cosy-voice\").\n\n**Response**:\n* **Success**: An audio file stream (`FileResponse`).\n* **Failure**: A JSON object detailing the error (e.g., validation errors, internal server error).\n\n**Note**: This v1 endpoint does *not* support voice cloning via reference audio. For voice cloning, please refer to the v2 API. 
Parameters like temperature, top_p, top_k, seed, prompts, and prefix might be overridden by specific speaker (`spk`) or style (`style`) configurations.", "operationId": "synthesize_tts_v1_tts_get", "parameters": [ { "name": "text", "in": "query", "required": true, "schema": { "type": "string", "title": "Text" } }, { "name": "spk", "in": "query", "required": false, "schema": { "type": "string", "default": "female2", "title": "Spk" } }, { "name": "style", "in": "query", "required": false, "schema": { "type": "string", "default": "chat", "title": "Style" } }, { "name": "temperature", "in": "query", "required": false, "schema": { "type": "number", "default": 0.3, "title": "Temperature" } }, { "name": "top_p", "in": "query", "required": false, "schema": { "type": "number", "default": 0.5, "title": "Top P" } }, { "name": "top_k", "in": "query", "required": false, "schema": { "type": "integer", "default": 20, "title": "Top K" } }, { "name": "seed", "in": "query", "required": false, "schema": { "type": "integer", "default": 42, "title": "Seed" } }, { "name": "format", "in": "query", "required": false, "schema": { "type": "string", "default": "raw", "title": "Format" } }, { "name": "bitrate", "in": "query", "required": false, "schema": { "type": "string", "default": "64k", "title": "Bitrate" } }, { "name": "prompt", "in": "query", "required": false, "schema": { "type": "string", "default": "", "title": "Prompt" } }, { "name": "prompt1", "in": "query", "required": false, "schema": { "type": "string", "default": "", "title": "Prompt1" } }, { "name": "prompt2", "in": "query", "required": false, "schema": { "type": "string", "default": "", "title": "Prompt2" } }, { "name": "prefix", "in": "query", "required": false, "schema": { "type": "string", "default": "", "title": "Prefix" } }, { "name": "bs", "in": "query", "required": false, "schema": { "type": "string", "default": "8", "title": "Bs" } }, { "name": "thr", "in": "query", "required": false, "schema": { "type": "string", 
"default": "100", "title": "Thr" } }, { "name": "eos", "in": "query", "required": false, "schema": { "type": "string", "default": "[uv_break]", "title": "Eos" } }, { "name": "enhance", "in": "query", "required": false, "schema": { "type": "boolean", "default": false, "title": "Enhance" } }, { "name": "denoise", "in": "query", "required": false, "schema": { "type": "boolean", "default": false, "title": "Denoise" } }, { "name": "speed", "in": "query", "required": false, "schema": { "type": "number", "default": 1.0, "title": "Speed" } }, { "name": "pitch", "in": "query", "required": false, "schema": { "type": "number", "default": 0, "title": "Pitch" } }, { "name": "volume_gain", "in": "query", "required": false, "schema": { "type": "number", "default": 0, "title": "Volume Gain" } }, { "name": "stream", "in": "query", "required": false, "schema": { "type": "boolean", "default": false, "title": "Stream" } }, { "name": "chunk_size", "in": "query", "required": false, "schema": { "type": "integer", "default": 64, "title": "Chunk Size" } }, { "name": "no_cache", "in": "query", "required": false, "schema": { "anyOf": [ { "type": "boolean" }, { "enum": [ "on", "off" ], "type": "string" } ], "default": false, "title": "No Cache" } }, { "name": "model", "in": "query", "required": false, "schema": { "type": "string", "default": "chat-tts", "title": "Model" } } ], "responses": { "200": { "description": "Successful Response" }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/ssml": { "post": { "tags": [ "SSML" ], "summary": "Synthesize Ssml Api", "description": "Synthesize speech from SSML-formatted input using a specified TTS model.\n\nThis endpoint supports multi-speaker, multi-style speech synthesis based on structured SSML input.\nIt can return audio in various formats (e.g., `raw`, `wav`, `mp3`), with optional enhancements and prosody adjustments.\n\n### Supported 
Features\n- Multiple speakers via `<voice>`\n- Arbitrary text segmentation with sentence break markers (e.g., `eos=\"[uv_break]\"`)\n- Streaming or full-response audio\n- Audio enhancement & pitch/speed adjustment via `EnhancerConfig` / `AdjustConfig`\n- Custom batch size & segment length control via `batch_size` / `spliter_thr`\n\n### Parameters\n- `ssml` (str): SSML XML string containing structured speech content (required)\n- `format` (str): Output audio format. One of: `raw`, `wav`, `mp3` (default: `raw`)\n- `batch_size` (int): Batch size for internal TTS inference, must be > 0\n- `eos` (str): End-of-sentence token for segmentation (default: `[uv_break]`)\n- `model` (str): TTS model identifier to be used (default: `chat-tts`)\n- `spliter_thr` (int): Threshold to split long texts (default: 100, minimum: 50)\n- `enhancer` (EnhancerConfig): Optional audio enhancer settings\n- `adjuster` (AdjustConfig): Optional pitch/speed/volume control\n- `stream` (bool): If true, returns a streaming response; otherwise, file response\n\n### Example SSML Input\n\n```xml\n\n ChatTTS 用于合成多角色多情感的有声书示例\n 黛玉冷笑道:\n 我说呢,亏了绊住,不然,早就飞起来了。\n 宝玉道:\n “只许和你玩,替你解闷。不过偶然到他那里,就说这些闲话。”\n “好没意思的话! 去不去,关我什么事儿? 又没叫你替我解闷儿 ,还许你不理我呢”\n 说着,便赌气回房去了。\n\n```\n\nThe endpoint returns a synthesized audio file or stream based on the provided SSML and configuration.", "operationId": "synthesize_ssml_api_v1_ssml_post", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { "$ref": "#/components/schemas/SSMLParams" } ], "title": "Params", "description": "JSON body with SSML string and format" } } }, "required": true }, "responses": { "200": { "description": "Successful Response" }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/text:synthesize": { "post": { "tags": [ "Google API" ], "summary": "Google Text Synthesize", "description": "google api document:
\n[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)\n\n- 多个属性在本系统中无用仅仅是为了兼容google api\n- voice 中的 topP, topK, temperature 为本系统中的参数\n- voice.name 即 speaker name (或者speaker seed)\n- voice.seed 为 infer seed (可在webui中测试具体作用)\n\n- 编码格式影响的是 audioContent 的二进制格式,所以所有format都是返回带有base64数据的json", "operationId": "google_text_synthesize_v1_text_synthesize_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GoogleTextSynthesizeParams" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GoogleTextSynthesizeResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/speech:recognize": { "post": { "tags": [ "Google API" ], "summary": "Speech Recognize", "description": "Performs synchronous speech recognition: receive results after all audio has been sent and processed.", "operationId": "speech_recognize_v1_speech_recognize_post", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } } } } }, "/v1/speech:longrunningrecognize": { "post": { "tags": [ "Google API" ], "summary": "Long Running Recognize", "description": "Performs asynchronous speech recognition: receive results via the google.longrunning.Operations interface.", "operationId": "long_running_recognize_v1_speech_longrunningrecognize_post", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } } } } }, "/v1/audio/speech": { "post": { "tags": [ "OpenAI API" ], "summary": "Openai Speech Api", "description": "openai api document: 
\n[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)\n\n以下属性为本系统自定义属性,不在openai文档中:\n- batch_size: 是否开启batch合成,小于等于1表示不使用batch (不推荐)\n- spliter_threshold: 开启batch合成时,句子分割的阈值\n- style: 风格\n\n> model 可填任意值", "operationId": "openai_speech_api_v1_audio_speech_post", "requestBody": { "content": { "application/json": { "schema": { "allOf": [ { "$ref": "#/components/schemas/AudioSpeechParams" } ], "title": "Params", "description": "JSON body with model, input text, and voice" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/audio/transcriptions": { "post": { "tags": [ "OpenAI API" ], "summary": "Transcribe", "description": "Transcribes audio into the input language.", "operationId": "transcribe_v1_audio_transcriptions_post", "requestBody": { "content": { "multipart/form-data": { "schema": { "$ref": "#/components/schemas/Body_transcribe_v1_audio_transcriptions_post" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/prompt/refine": { "post": { "tags": [ "Text" ], "summary": "Refiner Prompt Post", "description": "**DeprecationWarning**\n\nThis endpoint is deprecated and will be removed in the future.\n\nRequirements:\n- `chattts` model", "operationId": "refiner_prompt_post_v1_prompt_refine_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/RefineTextRequest" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { 
"application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/text/normalize": { "post": { "tags": [ "Text" ], "summary": "Text Normalize Post", "description": "Normalize raw input text using a selected Text Normalization (TN) pipeline.\n\nThis endpoint supports different TN implementations to perform text normalization \n(e.g., expanding numbers, abbreviations, adding pauses or prosodic markers for TTS, etc.).\n\n### Parameters\n\n- `text` (str): The raw text to normalize.\n- `pipe_id` (str): The TN pipeline to use. Available options:\n - `base`\n - `chat-tts`\n - `cosy-voice`\n - `fish-speech`\n - `f5-tts`\n - `index-tts`\n - `spark-tts`\n- `config` (TNConfig, optional): Optional configuration to customize TN behavior for specific pipelines.\n\n### Returns\n\nA normalized version of the input text, suitable for use in speech synthesis or downstream NLP tasks.", "operationId": "text_normalize_post_v1_text_normalize_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TextNormalizeRequest" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/BaseResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/xtts_v2/speakers": { "get": { "tags": [ "XTTS" ], "summary": "Speakers", "description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)", "operationId": "speakers_v1_xtts_v2_speakers_get", "responses": { "200": { "description": "Successful Response", "content": { 
"application/json": { "schema": {} } } } } } }, "/v1/xtts_v2/tts_to_audio": { "post": { "tags": [ "XTTS" ], "summary": "Tts To Audio", "description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)", "operationId": "tts_to_audio_v1_xtts_v2_tts_to_audio_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/SynthesisParams" } } }, "required": true }, "responses": { "200": { "description": "Successful Response" }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/xtts_v2/tts_stream": { "get": { "tags": [ "XTTS" ], "summary": "Tts Stream", "description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)", "operationId": "tts_stream_v1_xtts_v2_tts_stream_get", "parameters": [ { "name": "text", "in": "query", "required": true, "schema": { "type": "string", "title": "Text" } }, { "name": "speaker_wav", "in": "query", "required": true, "schema": { "type": "string", "title": "Speaker Wav" } }, { "name": "language", "in": "query", "required": false, "schema": { "type": "string", "default": "cn", "title": "Language" } }, { "name": "no_cache", "in": "query", "required": false, "schema": { "type": "boolean", "default": false, "title": "No Cache" } } ], "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/xtts_v2/set_tts_settings": { "post": { "tags": [ "XTTS" ], "summary": "Set Tts Settings", "description": "[[Click To 
XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)", "operationId": "set_tts_settings_v1_xtts_v2_set_tts_settings_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TTSSettingsRequest" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/stt/transcribe": { "post": { "tags": [ "STT" ], "summary": "Transcribe", "description": "Transcribes audio into the input language.", "operationId": "transcribe_v1_stt_transcribe_post", "requestBody": { "content": { "multipart/form-data": { "schema": { "$ref": "#/components/schemas/Body_transcribe_v1_stt_transcribe_post" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TranscriptionsResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/stt/stream": { "post": { "tags": [ "STT" ], "summary": "Transcribe Stream", "description": "Transcribes audio into the input language in real-time.\n\n* Not implemented yet (WIP)", "operationId": "transcribe_stream_v1_stt_stream_post", "requestBody": { "content": { "multipart/form-data": { "schema": { "$ref": "#/components/schemas/Body_transcribe_stream_v1_stt_stream_post" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": {} } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": 
"#/components/schemas/HTTPValidationError" } } } } } } }, "/v1/vc": { "post": { "tags": [ "Voice Clone" ], "summary": "Voice Clone", "description": "Voice cloning API\n\n**Deprecated**\nThis API is deprecated and will be removed in the future.\nPlease use the `TTS API` instead.", "operationId": "voice_clone_v1_vc_post", "parameters": [ { "name": "ref_spk", "in": "query", "required": false, "schema": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Ref Spk" } }, { "name": "spk_emotion", "in": "query", "required": false, "schema": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Spk Emotion" } }, { "name": "model", "in": "query", "required": false, "schema": { "type": "string", "default": "open-voice", "title": "Model" } }, { "name": "tau", "in": "query", "required": false, "schema": { "type": "number", "default": 0.3, "title": "Tau" } }, { "name": "format", "in": "query", "required": false, "schema": { "type": "string", "default": "mp3", "title": "Format" } } ], "requestBody": { "required": true, "content": { "multipart/form-data": { "schema": { "$ref": "#/components/schemas/Body_voice_clone_v1_vc_post" } } } }, "responses": { "200": { "description": "Successful Response" }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v2/tts": { "post": { "tags": [ "Forge V2" ], "summary": "Forge Text Synthesize", "description": "**Text-to-Speech Synthesis API (v2)**\n\nThis endpoint converts text into speech, offering a wide range of customization options.\nIt accepts JSON formatted data in the request body.\n\n**Core Functionality**:\n* Synthesizes speech from various text inputs.\n* Supports voice cloning by directly uploading a reference audio.\n\n**Input Flexibility**:\n* Provide text via one of these mutually exclusive fields:\n * `text`: A single string of text.\n * `texts`: A list of strings for batch processing.\n * `ssml`: 
Text formatted using Speech Synthesis Markup Language (SSML).\n\n**Speaker Customization (`spk`)**:\n* Use pre-defined speakers via `spk.from_spk_id` or `spk.from_spk_name`.\n* **Voice Cloning**: Provide reference audio for voice cloning:\n * `spk.from_ref.wav_b64`: Base64 encoded WAV audio data.\n * `spk.from_ref.text`: The transcript corresponding to the reference audio.\n\n**Audio Processing & Control**:\n* **Adjustments (`adjust`)**:\n * `pitch`: Modify audio pitch.\n * `speed_rate`: Adjust speaking rate.\n * `volume_gain_db`: Change audio volume.\n * `normalize`: Apply volume normalization.\n * `remove_silence`: Trim silence from audio ends.\n* **Encoding (`encoder`)**:\n * `format`: Output audio format (e.g., \"mp3\", \"wav\").\n * `bitrate`: Audio bitrate (e.g., \"64k\").\n * `acodec`: Audio codec (e.g., \"libmp3lame\").\n* **Enhancement (`enhance`)**:\n * `enabled`: Toggle audio enhancement.\n * `model`: Select enhancement model.\n * Additional parameters like `nfe`, `solver`, `lambd`, `tau` for fine-tuning.\n* **Inference (`infer`)**:\n * `batch_size`, `spliter_threshold`, `eos` for text processing.\n * `seed`: For reproducible outputs.\n * `stream`, `stream_chunk_size`: Enable and configure streaming output.\n * `no_cache`, `sync_gen`: Control caching and generation mode.\n* **Text Normalization (`tn`)**:\n * `enabled`/`disabled`: Specify text normalization rules.\n* **TTS Model Parameters (`tts`)**:\n * `mid`: TTS model ID (e.g., \"cosy-voice\").\n * `style`, `emotion`: Control speaking style and emotion.\n * `temperature`, `top_p`, `top_k`: Adjust sampling parameters for generation.\n * Deprecated: `prompt`, `prompt1`, `prompt2`, `prefix`.\n* **Voice Cloning (`vc`)** (Note: Potentially deprecated, mainly for OpenVoice):\n * `enabled`, `mid`, `emotion`, `tau`.\n\n**Response**:\n* **Success**: An audio file stream (`FileResponse`).\n* **Failure**: A JSON object detailing the error.\n\n**Key Considerations**:\n1. 
Ensure only one of `text`, `texts`, or `ssml` is provided.\n2. For voice cloning, use high-quality reference audio (3-5 seconds recommended).\n3. Enabling audio enhancement may increase generation latency.", "operationId": "forge_text_synthesize_v2_tts_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/V2TtsParams" } } }, "required": true }, "responses": { "200": { "description": "Successful Response" }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } }, "/v2/stt": { "post": { "tags": [ "Forge V2" ], "summary": "Transcribe V2", "description": "Transcribes audio using base64 or URL input. Stateless interface.", "operationId": "transcribe_v2_v2_stt_post", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TranscriptionRequest" } } }, "required": true }, "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TranscriptionsResponse" } } } }, "422": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HTTPValidationError" } } } } } } } }, "components": { "schemas": { "AdjustConfig": { "properties": { "pitch": { "type": "number", "title": "Pitch", "default": 0 }, "speed_rate": { "type": "number", "title": "Speed Rate", "default": 1 }, "volume_gain_db": { "type": "number", "title": "Volume Gain Db", "default": 0 }, "normalize": { "type": "boolean", "title": "Normalize", "default": true }, "headroom": { "type": "number", "title": "Headroom", "default": 1 }, "remove_silence": { "type": "boolean", "title": "Remove Silence", "default": false }, "remove_silence_threshold": { "type": "number", "title": "Remove Silence Threshold", "default": -42 } }, "type": "object", "title": "AdjustConfig" }, "AudioConfig": { "properties": { 
"audioEncoding": { "allOf": [ { "$ref": "#/components/schemas/AudioFormat" } ], "default": "raw" }, "audioBitrate": { "type": "string", "title": "Audiobitrate", "default": "64k" }, "speakingRate": { "type": "number", "title": "Speakingrate", "default": 1 }, "pitch": { "type": "number", "title": "Pitch", "default": 0 }, "volumeGainDb": { "type": "number", "title": "Volumegaindb", "default": 0 }, "sampleRateHertz": { "type": "integer", "title": "Sampleratehertz", "default": 24000 }, "batchSize": { "type": "integer", "title": "Batchsize", "default": 4 }, "spliterThreshold": { "type": "integer", "title": "Spliterthreshold", "default": 100 } }, "type": "object", "title": "AudioConfig" }, "AudioFormat": { "type": "string", "enum": [ "mp3", "wav", "ogg", "acc", "flac", "raw" ], "title": "AudioFormat" }, "AudioReference": { "properties": { "wav_b64": { "type": "string", "title": "Wav B64" }, "text": { "type": "string", "title": "Text" }, "emotion": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Emotion", "default": "default" } }, "type": "object", "required": [ "wav_b64", "text" ], "title": "AudioReference" }, "AudioSpeechParams": { "properties": { "input": { "type": "string", "title": "Input" }, "model": { "type": "string", "title": "Model", "default": "chat-tts" }, "voice": { "type": "string", "title": "Voice", "default": "female2" }, "response_format": { "allOf": [ { "$ref": "#/components/schemas/AudioFormat" } ], "default": "raw" }, "speed": { "type": "number", "maximum": 10.0, "minimum": 0.1, "title": "Speed", "description": "Speed of the audio", "default": 1 }, "seed": { "type": "integer", "title": "Seed", "default": 42 }, "temperature": { "type": "number", "title": "Temperature", "default": 0.3 }, "top_k": { "type": "integer", "title": "Top K", "default": 20 }, "top_p": { "type": "number", "title": "Top P", "default": 0.7 }, "style": { "type": "string", "title": "Style", "default": "" }, "batch_size": { "type": "integer", "maximum": 20.0, 
"minimum": 1.0, "title": "Batch Size", "description": "Batch size", "default": 1 }, "spliter_threshold": { "type": "number", "maximum": 1024.0, "minimum": 10.0, "title": "Spliter Threshold", "description": "Threshold for sentence spliter", "default": 100 }, "eos": { "type": "string", "title": "Eos", "default": "[uv_break]" }, "enhance": { "type": "boolean", "title": "Enhance", "default": false }, "denoise": { "type": "boolean", "title": "Denoise", "default": false }, "stream": { "type": "boolean", "title": "Stream", "default": false }, "bitrate": { "type": "string", "title": "Bitrate", "default": "64k" } }, "type": "object", "required": [ "input" ], "title": "AudioSpeechParams" }, "BaseResponse": { "properties": { "message": { "type": "string", "title": "Message" }, "data": { "title": "Data" } }, "type": "object", "required": [ "message", "data" ], "title": "BaseResponse" }, "Body_transcribe_stream_v1_stt_stream_post": { "properties": { "file": { "type": "string", "format": "binary", "title": "File" }, "model": { "type": "string", "title": "Model", "default": "whisper.large" }, "prompt": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prompt" }, "prefix": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prefix" }, "language": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Language" }, "temperature": { "anyOf": [ { "type": "number" }, { "type": "null" } ], "title": "Temperature" }, "sample_len": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Sample Len" }, "best_of": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Best Of" }, "beam_size": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Beam Size" }, "patience": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Patience" }, "length_penalty": { "anyOf": [ { "type": "number" }, { "type": "null" } ], "title": "Length Penalty" }, "format": { "anyOf": [ { "$ref": 
"#/components/schemas/STTOutputFormat" }, { "type": "null" } ], "default": "txt" }, "highlight_words": { "anyOf": [ { "type": "boolean" }, { "type": "null" } ], "title": "Highlight Words", "default": false }, "max_line_count": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Line Count" }, "max_line_width": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Line Width" }, "max_words_per_line": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Words Per Line" } }, "type": "object", "required": [ "file" ], "title": "Body_transcribe_stream_v1_stt_stream_post" }, "Body_transcribe_v1_audio_transcriptions_post": { "properties": { "file": { "type": "string", "format": "binary", "title": "File" }, "model": { "type": "string", "title": "Model", "default": "whisper.large" }, "language": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Language" }, "prompt": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prompt" }, "response_format": { "type": "string", "title": "Response Format", "default": "txt" }, "temperature": { "type": "number", "title": "Temperature", "default": 0 }, "timestamp_granularities": { "items": { "type": "string" }, "type": "array", "title": "Timestamp Granularities", "default": [ "segment" ] } }, "type": "object", "required": [ "file" ], "title": "Body_transcribe_v1_audio_transcriptions_post" }, "Body_transcribe_v1_stt_transcribe_post": { "properties": { "file": { "type": "string", "format": "binary", "title": "File" }, "model": { "type": "string", "title": "Model", "default": "whisper.large" }, "prompt": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prompt" }, "prefix": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prefix" }, "language": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Language" }, "temperature": { "anyOf": [ { "type": "number" }, { "type": "null" } ], "title": "Temperature" }, 
"sample_len": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Sample Len" }, "best_of": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Best Of" }, "beam_size": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Beam Size" }, "patience": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Patience" }, "length_penalty": { "anyOf": [ { "type": "number" }, { "type": "null" } ], "title": "Length Penalty" }, "format": { "anyOf": [ { "$ref": "#/components/schemas/STTOutputFormat" }, { "type": "null" } ], "default": "txt" }, "highlight_words": { "anyOf": [ { "type": "boolean" }, { "type": "null" } ], "title": "Highlight Words", "default": false }, "max_line_count": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Line Count" }, "max_line_width": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Line Width" }, "max_words_per_line": { "anyOf": [ { "type": "integer" }, { "type": "null" } ], "title": "Max Words Per Line" } }, "type": "object", "required": [ "file" ], "title": "Body_transcribe_v1_stt_transcribe_post" }, "Body_voice_clone_v1_vc_post": { "properties": { "src_audio": { "type": "string", "format": "binary", "title": "Src Audio" }, "ref_audio": { "anyOf": [ { "type": "string", "format": "binary" }, { "type": "null" } ], "title": "Ref Audio" } }, "type": "object", "required": [ "src_audio" ], "title": "Body_voice_clone_v1_vc_post" }, "CreateSpeaker": { "properties": { "name": { "type": "string", "title": "Name" }, "gender": { "type": "string", "title": "Gender", "default": "" }, "author": { "type": "string", "title": "Author", "default": "" }, "desc": { "type": "string", "title": "Desc", "default": "" }, "version": { "type": "string", "title": "Version", "default": "" }, "wavs": { "anyOf": [ { "items": { "$ref": "#/components/schemas/AudioReference" }, "type": "array" }, { "type": "null" } ], "title": "Wavs" }, "save_file": { "type": "boolean", 
"title": "Save File", "default": false } }, "type": "object", "required": [ "name" ], "title": "CreateSpeaker" }, "EncoderConfig": { "properties": { "format": { "allOf": [ { "$ref": "#/components/schemas/AudioFormat" } ], "default": "raw" }, "bitrate": { "type": "string", "title": "Bitrate", "default": "64k" }, "acodec": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Acodec" } }, "type": "object", "title": "EncoderConfig" }, "EnhancerConfig": { "properties": { "enabled": { "type": "boolean", "title": "Enabled", "default": false }, "model": { "type": "string", "title": "Model", "default": "resemble-enhance" }, "nfe": { "type": "integer", "title": "Nfe", "default": 32 }, "solver": { "type": "string", "enum": [ "midpoint", "rk4", "euler" ], "title": "Solver", "default": "midpoint" }, "lambd": { "type": "number", "title": "Lambd", "default": 0.5 }, "tau": { "type": "number", "title": "Tau", "default": 0.5 } }, "type": "object", "title": "EnhancerConfig" }, "FromUrl": { "properties": { "url": { "type": "string", "maxLength": 2083, "minLength": 1, "format": "uri", "title": "Url", "description": "音频文件 URL,必须是合法的 http(s) 地址" }, "headers": { "anyOf": [ { "additionalProperties": { "type": "string" }, "type": "object" }, { "type": "null" } ], "title": "Headers", "description": "请求 URL 时附带的自定义 header" } }, "type": "object", "required": [ "url" ], "title": "FromUrl" }, "GoogleTextSynthesizeParams": { "properties": { "input": { "$ref": "#/components/schemas/SynthesisInput" }, "voice": { "$ref": "#/components/schemas/VoiceSelectionParams" }, "audioConfig": { "$ref": "#/components/schemas/AudioConfig" }, "enhancerConfig": { "allOf": [ { "$ref": "#/components/schemas/EnhancerConfig" } ] } }, "type": "object", "required": [ "input", "voice", "audioConfig" ], "title": "GoogleTextSynthesizeParams" }, "GoogleTextSynthesizeResponse": { "properties": { "audioContent": { "type": "string", "title": "Audiocontent" } }, "type": "object", "required": [ "audioContent" ], 
"title": "GoogleTextSynthesizeResponse" }, "HTTPValidationError": { "properties": { "detail": { "items": { "$ref": "#/components/schemas/ValidationError" }, "type": "array", "title": "Detail" } }, "type": "object", "title": "HTTPValidationError" }, "InferConfig": { "properties": { "batch_size": { "type": "integer", "title": "Batch Size", "default": 4 }, "spliter_threshold": { "type": "integer", "title": "Spliter Threshold", "default": 100 }, "eos": { "type": "string", "title": "Eos", "default": "。" }, "seed": { "type": "integer", "title": "Seed", "default": 42 }, "stream": { "type": "boolean", "title": "Stream", "default": false }, "stream_chunk_size": { "type": "integer", "title": "Stream Chunk Size", "default": 64 }, "no_cache": { "type": "boolean", "title": "No Cache", "default": false }, "sync_gen": { "type": "boolean", "title": "Sync Gen", "default": false }, "timeout": { "type": "integer", "title": "Timeout", "default": 900 } }, "type": "object", "title": "InferConfig" }, "InputAudio": { "properties": { "from_url": { "anyOf": [ { "$ref": "#/components/schemas/FromUrl" }, { "type": "null" } ], "description": "从 URL 加载音频" }, "from_base64": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "From Base64", "description": "base64 编码的音频数据" } }, "type": "object", "title": "InputAudio" }, "RefineTextRequest": { "properties": { "text": { "type": "string", "title": "Text" }, "prompt": { "type": "string", "title": "Prompt", "default": "[oral_2][laugh_0][break_6]" }, "seed": { "type": "integer", "title": "Seed", "default": -1 }, "top_P": { "type": "number", "title": "Top P", "default": 0.7 }, "top_K": { "type": "integer", "title": "Top K", "default": 20 }, "temperature": { "type": "number", "title": "Temperature", "default": 0.7 }, "repetition_penalty": { "type": "number", "title": "Repetition Penalty", "default": 1.0 }, "max_new_token": { "type": "integer", "title": "Max New Token", "default": 384 }, "spliter_threshold": { "type": "integer", "title": 
"Spliter Threshold", "default": 300 }, "normalize": { "type": "boolean", "title": "Normalize", "default": true } }, "type": "object", "required": [ "text" ], "title": "RefineTextRequest" }, "SSMLParams": { "properties": { "ssml": { "type": "string", "title": "Ssml" }, "format": { "allOf": [ { "$ref": "#/components/schemas/AudioFormat" } ], "default": "raw" }, "batch_size": { "type": "integer", "title": "Batch Size", "default": 4 }, "eos": { "type": "string", "title": "Eos", "default": "[uv_break]" }, "model": { "type": "string", "title": "Model", "default": "chat-tts" }, "spliter_thr": { "type": "integer", "title": "Spliter Thr", "default": 100 }, "enhancer": { "allOf": [ { "$ref": "#/components/schemas/EnhancerConfig" } ], "default": { "enabled": false, "model": "resemble-enhance", "nfe": 32, "solver": "midpoint", "lambd": 0.5, "tau": 0.5 } }, "adjuster": { "allOf": [ { "$ref": "#/components/schemas/AdjustConfig" } ], "default": { "pitch": 0.0, "speed_rate": 1.0, "volume_gain_db": 0.0, "normalize": true, "headroom": 1.0, "remove_silence": false, "remove_silence_threshold": -42.0 } }, "stream": { "type": "boolean", "title": "Stream", "default": false } }, "type": "object", "required": [ "ssml" ], "title": "SSMLParams" }, "STTOutputFormat": { "type": "string", "enum": [ "txt", "vtt", "srt", "tsv", "lrc", "json" ], "title": "STTOutputFormat" }, "SpeakerConfig": { "properties": { "from_spk_id": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "From Spk Id" }, "from_spk_name": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "From Spk Name" }, "from_ref": { "anyOf": [ { "$ref": "#/components/schemas/SpeakerReference" }, { "type": "null" } ] } }, "type": "object", "title": "SpeakerConfig", "description": "任选其中一种形式指定 spk" }, "SpeakerDetail": { "properties": { "id": { "type": "string", "title": "Id" }, "with_emb": { "type": "boolean", "title": "With Emb", "default": false } }, "type": "object", "required": [ "id" ], "title": 
"SpeakerDetail" }, "SpeakerReference": { "properties": { "wav_b64": { "type": "string", "title": "Wav B64" }, "text": { "type": "string", "title": "Text" } }, "type": "object", "required": [ "wav_b64", "text" ], "title": "SpeakerReference" }, "SpeakersUpdate": { "properties": { "speakers": { "items": { "type": "object" }, "type": "array", "title": "Speakers" } }, "type": "object", "required": [ "speakers" ], "title": "SpeakersUpdate" }, "SubtitleSegment": { "prefixItems": [ { "type": "string", "title": "Start" }, { "type": "string", "title": "End" }, { "type": "string", "title": "Text" }, { "items": {}, "type": "array", "title": "Words" }, { "type": "number", "title": "Start S" }, { "type": "number", "title": "End S" } ], "type": "array", "maxItems": 6, "minItems": 6 }, "SynthesisInput": { "properties": { "text": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Text" }, "ssml": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Ssml" } }, "type": "object", "title": "SynthesisInput" }, "SynthesisParams": { "properties": { "text": { "type": "string", "title": "Text" }, "speaker_wav": { "type": "string", "title": "Speaker Wav" }, "language": { "type": "string", "title": "Language", "default": "cn" } }, "type": "object", "required": [ "text", "speaker_wav" ], "title": "SynthesisParams" }, "TNConfig": { "properties": { "enabled": { "anyOf": [ { "items": { "type": "string" }, "type": "array" }, { "type": "null" } ], "title": "Enabled" }, "disabled": { "anyOf": [ { "items": { "type": "string" }, "type": "array" }, { "type": "null" } ], "title": "Disabled" } }, "type": "object", "title": "TNConfig" }, "TTSConfig": { "properties": { "mid": { "type": "string", "title": "Mid", "default": "chat-tts" }, "style": { "type": "string", "title": "Style", "default": "" }, "temperature": { "type": "number", "title": "Temperature", "default": 0.3 }, "top_p": { "type": "number", "title": "Top P", "default": 0.7 }, "top_k": { "type": "integer", "title": 
"Top K", "default": 20 }, "prompt": { "type": "string", "title": "Prompt", "default": "" }, "prompt1": { "type": "string", "title": "Prompt1", "default": "" }, "prompt2": { "type": "string", "title": "Prompt2", "default": "" }, "prefix": { "type": "string", "title": "Prefix", "default": "" }, "emotion": { "type": "string", "title": "Emotion", "default": "" } }, "type": "object", "title": "TTSConfig" }, "TTSSettingsRequest": { "properties": { "stream_chunk_size": { "type": "integer", "title": "Stream Chunk Size" }, "temperature": { "type": "number", "title": "Temperature" }, "speed": { "type": "number", "title": "Speed" }, "length_penalty": { "type": "number", "title": "Length Penalty" }, "repetition_penalty": { "type": "number", "title": "Repetition Penalty" }, "top_p": { "type": "number", "title": "Top P" }, "top_k": { "type": "integer", "title": "Top K" }, "enable_text_splitting": { "type": "boolean", "title": "Enable Text Splitting" }, "batch_size": { "type": "integer", "title": "Batch Size" }, "eos": { "type": "string", "title": "Eos" }, "infer_seed": { "type": "integer", "title": "Infer Seed" }, "use_decoder": { "type": "boolean", "title": "Use Decoder" }, "prompt1": { "type": "string", "title": "Prompt1" }, "prompt2": { "type": "string", "title": "Prompt2" }, "prefix": { "type": "string", "title": "Prefix" }, "spliter_threshold": { "type": "integer", "title": "Spliter Threshold" }, "style": { "type": "string", "title": "Style" } }, "type": "object", "required": [ "stream_chunk_size", "temperature", "speed", "length_penalty", "repetition_penalty", "top_p", "top_k", "enable_text_splitting" ], "title": "TTSSettingsRequest" }, "TextNormalizeRequest": { "properties": { "text": { "type": "string", "title": "Text" }, "pipe_id": { "type": "string", "enum": [ "base", "chat-tts", "cosy-voice", "fish-speech", "f5-tts", "index-tts", "spark-tts" ], "title": "Pipe Id", "default": "base" }, "config": { "anyOf": [ { "$ref": "#/components/schemas/TNConfig" }, { "type": "null" 
} ] } }, "type": "object", "required": [ "text" ], "title": "TextNormalizeRequest" }, "TranscribeResult": { "properties": { "text": { "type": "string", "title": "Text" }, "segments": { "items": { "$ref": "#/components/schemas/SubtitleSegment" }, "type": "array", "title": "Segments" }, "language": { "type": "string", "title": "Language" } }, "type": "object", "required": [ "text", "segments", "language" ], "title": "TranscribeResult" }, "TranscriptionRequest": { "properties": { "input_audio": { "$ref": "#/components/schemas/InputAudio" }, "model": { "type": "string", "title": "Model", "description": "ASR 模型名称,如 whisper.base、whisper.large", "default": "whisper.large" }, "refrence_transcript": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Refrence Transcript", "description": "参考文案" }, "prompt": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prompt", "description": "提示词,用于引导模型" }, "prefix": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Prefix", "description": "对话历史或固定开头" }, "language": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Language", "description": "语言代码,如 'en', 'zh' 等" }, "temperature": { "anyOf": [ { "type": "number", "maximum": 1.0, "minimum": 0.0 }, { "type": "null" } ], "title": "Temperature", "description": "采样温度,控制多样性,范围 0~1" }, "sample_len": { "anyOf": [ { "type": "integer", "minimum": 1.0 }, { "type": "null" } ], "title": "Sample Len", "description": "采样长度,必须 >= 1" }, "best_of": { "anyOf": [ { "type": "integer", "minimum": 1.0 }, { "type": "null" } ], "title": "Best Of", "description": "在 temperature > 0 时采样 n 次取最佳结果" }, "beam_size": { "anyOf": [ { "type": "integer", "minimum": 1.0 }, { "type": "null" } ], "title": "Beam Size", "description": "beam search 的宽度,推荐 5~10" }, "patience": { "anyOf": [ { "type": "number", "minimum": 0.0 }, { "type": "null" } ], "title": "Patience", "description": "beam search 的 patience 参数,越大越宽松" }, "length_penalty": { "anyOf": [ { "type": 
"number" }, { "type": "null" } ], "title": "Length Penalty", "description": "对生成文本长度的惩罚,负值鼓励短输出" }, "format": { "allOf": [ { "$ref": "#/components/schemas/STTOutputFormat" } ], "description": "输出格式,如 txt, json, srt, vtt", "default": "txt" }, "highlight_words": { "anyOf": [ { "type": "boolean" }, { "type": "null" } ], "title": "Highlight Words", "description": "是否高亮每个识别的单词(如 JSON 格式)", "default": false }, "max_line_count": { "anyOf": [ { "type": "integer", "exclusiveMinimum": 0.0 }, { "type": "null" } ], "title": "Max Line Count", "description": "最大行数限制(用于格式化输出)" }, "max_line_width": { "anyOf": [ { "type": "integer", "exclusiveMinimum": 0.0 }, { "type": "null" } ], "title": "Max Line Width", "description": "最大每行字符数(用于 txt 格式)" }, "max_words_per_line": { "anyOf": [ { "type": "integer", "exclusiveMinimum": 0.0 }, { "type": "null" } ], "title": "Max Words Per Line", "description": "每行最多几个词(用于格式化输出)" } }, "type": "object", "required": [ "input_audio" ], "title": "TranscriptionRequest" }, "TranscriptionsResponse": { "properties": { "message": { "type": "string", "title": "Message" }, "data": { "$ref": "#/components/schemas/TranscribeResult" } }, "type": "object", "required": [ "message", "data" ], "title": "TranscriptionsResponse" }, "UpdateSpeaker": { "properties": { "json": { "anyOf": [ { "type": "object" }, { "type": "null" } ], "title": "Json" } }, "type": "object", "title": "UpdateSpeaker" }, "V2TtsParams": { "properties": { "adjust": { "anyOf": [ { "$ref": "#/components/schemas/AdjustConfig" }, { "type": "null" } ] }, "encoder": { "anyOf": [ { "$ref": "#/components/schemas/EncoderConfig" }, { "type": "null" } ] }, "enhance": { "anyOf": [ { "$ref": "#/components/schemas/EnhancerConfig" }, { "type": "null" } ] }, "infer": { "anyOf": [ { "$ref": "#/components/schemas/InferConfig" }, { "type": "null" } ] }, "vc": { "anyOf": [ { "$ref": "#/components/schemas/VCConfig" }, { "type": "null" } ] }, "tn": { "anyOf": [ { "$ref": "#/components/schemas/TNConfig" }, { "type": 
"null" } ] }, "tts": { "$ref": "#/components/schemas/TTSConfig" }, "spk": { "anyOf": [ { "$ref": "#/components/schemas/SpeakerConfig" }, { "type": "null" } ] }, "text": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Text" }, "texts": { "anyOf": [ { "items": { "type": "string" }, "type": "array" }, { "type": "null" } ], "title": "Texts" }, "ssml": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Ssml" } }, "type": "object", "title": "V2TtsParams" }, "VCConfig": { "properties": { "enabled": { "type": "boolean", "title": "Enabled", "default": false }, "mid": { "type": "string", "title": "Mid", "default": "open-voice" }, "emotion": { "anyOf": [ { "type": "string" }, { "type": "null" } ], "title": "Emotion" }, "tau": { "type": "number", "title": "Tau", "default": 0.3 } }, "type": "object", "title": "VCConfig" }, "ValidationError": { "properties": { "loc": { "items": { "anyOf": [ { "type": "string" }, { "type": "integer" } ] }, "type": "array", "title": "Location" }, "msg": { "type": "string", "title": "Message" }, "type": { "type": "string", "title": "Error Type" } }, "type": "object", "required": [ "loc", "msg", "type" ], "title": "ValidationError" }, "VoiceSelectionParams": { "properties": { "languageCode": { "type": "string", "title": "Languagecode", "default": "ZH-CN" }, "name": { "type": "string", "title": "Name", "default": "female2" }, "style": { "type": "string", "title": "Style", "default": "" }, "temperature": { "type": "number", "title": "Temperature", "default": 0.3 }, "topP": { "type": "number", "title": "Topp", "default": 0.7 }, "topK": { "type": "integer", "title": "Topk", "default": 20 }, "seed": { "type": "integer", "title": "Seed", "default": 42 }, "eos": { "type": "string", "title": "Eos", "default": "[uv_break]" }, "model": { "type": "string", "title": "Model", "default": "chat-tts" } }, "type": "object", "title": "VoiceSelectionParams" } } }, "servers": [ { "url": "http://127.0.0.1:7870" }, { "url": 
"http://0.0.0.0:7870" }, { "url": "http://localhost:7870" } ] }