{
"openapi": "3.1.0",
"info": {
"title": "Speech-AI Forge API",
"description": "\n🍦 Speech-AI-Forge 是一个围绕 TTS 生成模型 ChatTTS 开发的项目,实现了 API Server 和 基于 Gradio 的 WebUI。
\n🍦 Speech-AI-Forge is a project developed around the TTS generation model ChatTTS, implementing an API Server and a Gradio-based WebUI.\n\n项目地址: [https://github.com/lenML/Speech-AI-Forge](https://github.com/lenML/Speech-AI-Forge)\n\n> 所有生成音频的 POST api都无法在此页面调试,调试建议使用 playground
\n> All audio generation POST APIs cannot be debugged on this page, it is recommended to use playground for debugging\n\n> 如果你不熟悉本系统,建议从这个一键脚本开始,在colab中尝试一下:
\n> [https://colab.research.google.com/github/lenML/Speech-AI-Forge/blob/main/colab.ipynb](https://colab.research.google.com/github/lenML/Speech-AI-Forge/blob/main/colab.ipynb)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)\n ",
"version": "0.1.0"
},
"paths": {
"/v1/ping": {
"get": {
"tags": [
"System"
],
"summary": "Ping",
"description": "Health check",
"operationId": "ping_v1_ping_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/versions": {
"get": {
"tags": [
"System"
],
"summary": "Get Versions",
"description": "Get versions",
"operationId": "get_versions_v1_versions_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/audio_formats": {
"get": {
"tags": [
"System"
],
"summary": "Get Audio Formats",
"description": "Get audio encoder formats",
"operationId": "get_audio_formats_v1_audio_formats_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/models/reload": {
"get": {
"tags": [
"Models"
],
"summary": "Reload Models",
"operationId": "reload_models_v1_models_reload_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/models/unload": {
"get": {
"tags": [
"Models"
],
"summary": "Unload Models",
"operationId": "unload_models_v1_models_unload_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/models/list": {
"get": {
"tags": [
"Models"
],
"summary": "Unload Models",
"operationId": "unload_models_v1_models_list_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/styles/list": {
"get": {
"tags": [
"Style"
],
"summary": "List Styles",
"description": "**DEPRECATED**\nThis API is deprecated and will be removed in the future. We will replace it with the speaker system and prompt system.",
"operationId": "list_styles_v1_styles_list_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/speakers/list": {
"get": {
"tags": [
"Speaker"
],
"summary": "List Speakers",
"description": "List all available speakers with optional pagination and detail control.\n\n- `detailed`: If true, returns complete metadata including references and embeddings.\n- `offset` / `limit`: Support for paginated speaker listing.",
"operationId": "list_speakers_v1_speakers_list_get",
"parameters": [
{
"name": "detailed",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "Detailed"
}
},
{
"name": "offset",
"in": "query",
"required": false,
"schema": {
"type": "integer",
"default": 0,
"title": "Offset"
}
},
{
"name": "limit",
"in": "query",
"required": false,
"schema": {
"type": "integer",
"default": 5,
"title": "Limit"
}
}
],
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/speakers/refresh": {
"post": {
"tags": [
"Speaker"
],
"summary": "Refresh Speakers",
"description": "Force reload of all speaker metadata from disk. \nUse this when files are modified externally or newly added.",
"operationId": "refresh_speakers_v1_speakers_refresh_post",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
}
}
}
},
"/v1/speakers/update": {
"post": {
"tags": [
"Speaker"
],
"summary": "Update Speakers",
"description": "Batch update multiple speakers by providing a list of speaker JSON configs.\n\nEach speaker must already exist (matched by ID). \nWill overwrite corresponding fields and persist changes to disk.",
"operationId": "update_speakers_v1_speakers_update_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SpeakersUpdate"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/speaker/create": {
"post": {
"tags": [
"Speaker"
],
"summary": "Create Speaker",
"description": "Create a new speaker profile with optional reference audios.\n\n- `name` is required and used as unique identifier.\n- `wavs` is a list of audio samples (base64-encoded) and reference texts for embedding.\n- `save_file`: If true, the speaker will be saved to disk and available after refresh.",
"operationId": "create_speaker_v1_speaker_create_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreateSpeaker"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/speaker/update": {
"post": {
"tags": [
"Speaker"
],
"summary": "Update Speaker",
"description": "Update a single speaker's configuration by full JSON override.\n\nThe speaker must already exist (matched by ID). \nFields like name, gender, refs, etc., will be updated accordingly.",
"operationId": "update_speaker_v1_speaker_update_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/UpdateSpeaker"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/speaker/detail": {
"post": {
"tags": [
"Speaker"
],
"summary": "Speaker Detail",
"description": "Fetch metadata of a specific speaker by ID.\n\n- `with_emb`: If true, includes embedding vectors and all reference data.",
"operationId": "speaker_detail_v1_speaker_detail_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SpeakerDetail"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/tts": {
"get": {
"tags": [
"TTS"
],
"summary": "Synthesize Tts",
"description": "**Text-to-Speech Synthesis API (v1 - GET)**\n\nThis endpoint converts text into speech using GET request parameters.\nIt offers various options for customizing the voice, style, audio output, and processing.\n\n**Mandatory Parameter**:\n* `text`: The text string to be synthesized.\n\n**Speaker and Style Customization**:\n* `spk`: Specify the speaker by name or seed (e.g., \"female2\").\n* `style`: Define the speaking style (e.g., \"chat\").\n\n**Generation Control**:\n* `temperature`: Sampling temperature (0.0-1.0, e.g., 0.3). Controls randomness.\n* `top_p`: Nucleus sampling probability (0.0-1.0, e.g., 0.5).\n* `top_k`: Limits sampling to the K most likely next tokens (e.g., 20).\n* `seed`: Seed for reproducible generation (e.g., 42).\n* `prompt`, `prompt1`, `prompt2`: Optional text prompts to guide inference.\n* `prefix`: Optional text prefix for inference.\n* `bs` (batch_size): Batch size for processing (e.g., \"8\").\n* `thr` (threshold): Sentence splitter threshold (e.g., \"100\").\n* `eos`: End-of-sentence marker (e.g., \"[uv_break]\").\n\n**Output Audio Configuration**:\n* `format`: Desired audio output format. 
Supported: \"mp3\", \"wav\", \"raw\" (default).\n* `bitrate`: Audio bitrate for compressed formats (e.g., \"64k\").\n\n**Audio Adjustments**:\n* `speed`: Playback speed multiplier (e.g., 1.0 for normal).\n* `pitch`: Pitch adjustment (e.g., 0 for no change).\n* `volume_gain`: Volume gain in dB (e.g., 0 for no change).\n\n**Enhancements**:\n* `enhance`: Boolean to enable audio enhancement (default: false).\n* `denoise`: Boolean to enable audio denoising (default: false).\n\n**Streaming Output**:\n* `stream`: Boolean to enable streaming audio generation (default: false).\n* `chunk_size`: Size of chunks for streaming (e.g., 64, if stream is true).\n\n**Caching**:\n* `no_cache`: Boolean or \"on\"/\"off\" to disable caching (default: false).\n\n**Model Selection**:\n* `model`: Specify the TTS model ID to use (e.g., \"chat-tts\", \"cosy-voice\").\n\n**Response**:\n* **Success**: An audio file stream (`FileResponse`).\n* **Failure**: A JSON object detailing the error (e.g., validation errors, internal server error).\n\n**Note**: This v1 endpoint does *not* support voice cloning via reference audio. For voice cloning, please refer to the v2 API. Parameters like temperature, top_p, top_k, seed, prompts, and prefix might be overridden by specific speaker (`spk`) or style (`style`) configurations.",
"operationId": "synthesize_tts_v1_tts_get",
"parameters": [
{
"name": "text",
"in": "query",
"required": true,
"schema": {
"type": "string",
"title": "Text"
}
},
{
"name": "spk",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "female2",
"title": "Spk"
}
},
{
"name": "style",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "chat",
"title": "Style"
}
},
{
"name": "temperature",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 0.3,
"title": "Temperature"
}
},
{
"name": "top_p",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 0.5,
"title": "Top P"
}
},
{
"name": "top_k",
"in": "query",
"required": false,
"schema": {
"type": "integer",
"default": 20,
"title": "Top K"
}
},
{
"name": "seed",
"in": "query",
"required": false,
"schema": {
"type": "integer",
"default": 42,
"title": "Seed"
}
},
{
"name": "format",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "raw",
"title": "Format"
}
},
{
"name": "bitrate",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "64k",
"title": "Bitrate"
}
},
{
"name": "prompt",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "",
"title": "Prompt"
}
},
{
"name": "prompt1",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "",
"title": "Prompt1"
}
},
{
"name": "prompt2",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "",
"title": "Prompt2"
}
},
{
"name": "prefix",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "",
"title": "Prefix"
}
},
{
"name": "bs",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "8",
"title": "Bs"
}
},
{
"name": "thr",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "100",
"title": "Thr"
}
},
{
"name": "eos",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "[uv_break]",
"title": "Eos"
}
},
{
"name": "enhance",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "Enhance"
}
},
{
"name": "denoise",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "Denoise"
}
},
{
"name": "speed",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 1.0,
"title": "Speed"
}
},
{
"name": "pitch",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 0,
"title": "Pitch"
}
},
{
"name": "volume_gain",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 0,
"title": "Volume Gain"
}
},
{
"name": "stream",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "Stream"
}
},
{
"name": "chunk_size",
"in": "query",
"required": false,
"schema": {
"type": "integer",
"default": 64,
"title": "Chunk Size"
}
},
{
"name": "no_cache",
"in": "query",
"required": false,
"schema": {
"anyOf": [
{
"type": "boolean"
},
{
"enum": [
"on",
"off"
],
"type": "string"
}
],
"default": false,
"title": "No Cache"
}
},
{
"name": "model",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "chat-tts",
"title": "Model"
}
}
],
"responses": {
"200": {
"description": "Successful Response"
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/ssml": {
"post": {
"tags": [
"SSML"
],
"summary": "Synthesize Ssml Api",
"description": "Synthesize speech from SSML-formatted input using a specified TTS model.\n\nThis endpoint supports multi-speaker, multi-style speech synthesis based on structured SSML input.\nIt can return audio in various formats (e.g., `raw`, `wav`, `mp3`), with optional enhancements and prosody adjustments.\n\n### Supported Features\n- Multiple speakers via ``\n- Arbitrary text segmentation with sentence break markers (e.g., `eos=\"[uv_break]\"`)\n- Streaming or full-response audio\n- Audio enhancement & pitch/speed adjustment via `EnhancerConfig` / `AdjustConfig`\n- Custom batch size & segment length control via `batch_size` / `spliter_thr`\n\n### Parameters\n- `ssml` (str): SSML XML string containing structured speech content (required)\n- `format` (str): Output audio format. One of: `raw`, `wav`, `mp3` (default: `raw`)\n- `batch_size` (int): Batch size for internal TTS inference, must be > 0\n- `eos` (str): End-of-sentence token for segmentation (default: `[uv_break]`)\n- `model` (str): TTS model identifier to be used (default: `chat-tts`)\n- `spliter_thr` (int): Threshold to split long texts (default: 100, minimum: 50)\n- `enhancer` (EnhancerConfig): Optional audio enhancer settings\n- `adjuster` (AdjustConfig): Optional pitch/speed/volume control\n- `stream` (bool): If true, returns a streaming response; otherwise, file response\n\n### Example SSML Input\n\n```xml\n\n ChatTTS 用于合成多角色多情感的有声书示例\n 黛玉冷笑道:\n 我说呢,亏了绊住,不然,早就飞起来了。\n 宝玉道:\n “只许和你玩,替你解闷。不过偶然到他那里,就说这些闲话。”\n “好没意思的话! 去不去,关我什么事儿? 又没叫你替我解闷儿 ,还许你不理我呢”\n 说着,便赌气回房去了。\n\n````\n\nThe endpoint returns a synthesized audio file or stream based on the provided SSML and configuration.",
"operationId": "synthesize_ssml_api_v1_ssml_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"allOf": [
{
"$ref": "#/components/schemas/SSMLParams"
}
],
"title": "Params",
"description": "JSON body with SSML string and format"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response"
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/text:synthesize": {
"post": {
"tags": [
"Google API"
],
"summary": "Google Text Synthesize",
"description": "google api document:
\n[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)\n\n- 多个属性在本系统中无用仅仅是为了兼容google api\n- voice 中的 topP, topK, temperature 为本系统中的参数\n- voice.name 即 speaker name (或者speaker seed)\n- voice.seed 为 infer seed (可在webui中测试具体作用)\n\n- 编码格式影响的是 audioContent 的二进制格式,所以所有format都是返回带有base64数据的json",
"operationId": "google_text_synthesize_v1_text_synthesize_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GoogleTextSynthesizeParams"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GoogleTextSynthesizeResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/speech:recognize": {
"post": {
"tags": [
"Google API"
],
"summary": "Speech Recognize",
"description": "Performs synchronous speech recognition: receive results after all audio has been sent and processed.",
"operationId": "speech_recognize_v1_speech_recognize_post",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
}
}
}
},
"/v1/speech:longrunningrecognize": {
"post": {
"tags": [
"Google API"
],
"summary": "Long Running Recognize",
"description": "Performs asynchronous speech recognition: receive results via the google.longrunning.Operations interface.",
"operationId": "long_running_recognize_v1_speech_longrunningrecognize_post",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
}
}
}
},
"/v1/audio/speech": {
"post": {
"tags": [
"OpenAI API"
],
"summary": "Openai Speech Api",
"description": "openai api document: \n[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)\n\n以下属性为本系统自定义属性,不在openai文档中:\n- batch_size: 是否开启batch合成,小于等于1表示不使用batch (不推荐)\n- spliter_threshold: 开启batch合成时,句子分割的阈值\n- style: 风格\n\n> model 可填任意值",
"operationId": "openai_speech_api_v1_audio_speech_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"allOf": [
{
"$ref": "#/components/schemas/AudioSpeechParams"
}
],
"title": "Params",
"description": "JSON body with model, input text, and voice"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/audio/transcriptions": {
"post": {
"tags": [
"OpenAI API"
],
"summary": "Transcribe",
"description": "Transcribes audio into the input language.",
"operationId": "transcribe_v1_audio_transcriptions_post",
"requestBody": {
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_transcribe_v1_audio_transcriptions_post"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/prompt/refine": {
"post": {
"tags": [
"Text"
],
"summary": "Refiner Prompt Post",
"description": "**DeprecationWarning**\n\nThis endpoint is deprecated and will be removed in the future.\n\nRequirements:\n- `chattts` model",
"operationId": "refiner_prompt_post_v1_prompt_refine_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RefineTextRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/text/normalize": {
"post": {
"tags": [
"Text"
],
"summary": "Text Normalize Post",
"description": "Normalize raw input text using a selected Text Normalization (TN) pipeline.\n\nThis endpoint supports different TN implementations to perform text normalization \n(e.g., expanding numbers, abbreviations, adding pauses or prosodic markers for TTS, etc.).\n\n### Parameters\n\n- `text` (str): The raw text to normalize.\n- `pipe_id` (str): The TN pipeline to use. Available options:\n - `base`\n - `chat-tts`\n - `cosy-voice`\n - `fish-speech`\n - `f5-tts`\n - `index-tts`\n - `spark-tts`\n- `config` (TNConfig, optional): Optional configuration to customize TN behavior for specific pipelines.\n\n### Returns\n\nA normalized version of the input text, suitable for use in speech synthesis or downstream NLP tasks.",
"operationId": "text_normalize_post_v1_text_normalize_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TextNormalizeRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/BaseResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/xtts_v2/speakers": {
"get": {
"tags": [
"XTTS"
],
"summary": "Speakers",
"description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)",
"operationId": "speakers_v1_xtts_v2_speakers_get",
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
}
}
}
},
"/v1/xtts_v2/tts_to_audio": {
"post": {
"tags": [
"XTTS"
],
"summary": "Tts To Audio",
"description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)",
"operationId": "tts_to_audio_v1_xtts_v2_tts_to_audio_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SynthesisParams"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response"
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/xtts_v2/tts_stream": {
"get": {
"tags": [
"XTTS"
],
"summary": "Tts Stream",
"description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)",
"operationId": "tts_stream_v1_xtts_v2_tts_stream_get",
"parameters": [
{
"name": "text",
"in": "query",
"required": true,
"schema": {
"type": "string",
"title": "Text"
}
},
{
"name": "speaker_wav",
"in": "query",
"required": true,
"schema": {
"type": "string",
"title": "Speaker Wav"
}
},
{
"name": "language",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "cn",
"title": "Language"
}
},
{
"name": "no_cache",
"in": "query",
"required": false,
"schema": {
"type": "boolean",
"default": false,
"title": "No Cache"
}
}
],
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/xtts_v2/set_tts_settings": {
"post": {
"tags": [
"XTTS"
],
"summary": "Set Tts Settings",
"description": "[[Click To XTTS_V2 API Document]](https://github.com/lenML/Speech-AI-Forge/blob/main/docs/api_xtts.md)\n\n[[Learn More About Documents]](https://github.com/lenML/Speech-AI-Forge/issues/240)",
"operationId": "set_tts_settings_v1_xtts_v2_set_tts_settings_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TTSSettingsRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/stt/transcribe": {
"post": {
"tags": [
"STT"
],
"summary": "Transcribe",
"description": "Transcribes audio into the input language.",
"operationId": "transcribe_v1_stt_transcribe_post",
"requestBody": {
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_transcribe_v1_stt_transcribe_post"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TranscriptionsResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/stt/stream": {
"post": {
"tags": [
"STT"
],
"summary": "Transcribe Stream",
"description": "Transcribes audio into the input language in real-time.\n\n* Not implemented yet (WIP)",
"operationId": "transcribe_stream_v1_stt_stream_post",
"requestBody": {
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_transcribe_stream_v1_stt_stream_post"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v1/vc": {
"post": {
"tags": [
"Voice Clone"
],
"summary": "Voice Clone",
"description": "Voice cloning API\n\n**Deprecated**\nThis API is deprecated and will be removed in the future.\nPlease use the `TTS API` instead.",
"operationId": "voice_clone_v1_vc_post",
"parameters": [
{
"name": "ref_spk",
"in": "query",
"required": false,
"schema": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Ref Spk"
}
},
{
"name": "spk_emotion",
"in": "query",
"required": false,
"schema": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Spk Emotion"
}
},
{
"name": "model",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "open-voice",
"title": "Model"
}
},
{
"name": "tau",
"in": "query",
"required": false,
"schema": {
"type": "number",
"default": 0.3,
"title": "Tau"
}
},
{
"name": "format",
"in": "query",
"required": false,
"schema": {
"type": "string",
"default": "mp3",
"title": "Format"
}
}
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_voice_clone_v1_vc_post"
}
}
}
},
"responses": {
"200": {
"description": "Successful Response"
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v2/tts": {
"post": {
"tags": [
"Forge V2"
],
"summary": "Forge Text Synthesize",
"description": "**Text-to-Speech Synthesis API (v2)**\n\nThis endpoint converts text into speech, offering a wide range of customization options.\nIt accepts JSON formatted data in the request body.\n\n**Core Functionality**:\n* Synthesizes speech from various text inputs.\n* Supports voice cloning by directly uploading a reference audio.\n\n**Input Flexibility**:\n* Provide text via one of these mutually exclusive fields:\n * `text`: A single string of text.\n * `texts`: A list of strings for batch processing.\n * `ssml`: Text formatted using Speech Synthesis Markup Language (SSML).\n\n**Speaker Customization (`spk`)**:\n* Use pre-defined speakers via `spk.from_spk_id` or `spk.from_spk_name`.\n* **Voice Cloning**: Provide reference audio for voice cloning:\n * `spk.from_ref.wav_b64`: Base64 encoded WAV audio data.\n * `spk.from_ref.text`: The transcript corresponding to the reference audio.\n\n**Audio Processing & Control**:\n* **Adjustments (`adjust`)**:\n * `pitch`: Modify audio pitch.\n * `speed_rate`: Adjust speaking rate.\n * `volume_gain_db`: Change audio volume.\n * `normalize`: Apply volume normalization.\n * `remove_silence`: Trim silence from audio ends.\n* **Encoding (`encoder`)**:\n * `format`: Output audio format (e.g., \"mp3\", \"wav\").\n * `bitrate`: Audio bitrate (e.g., \"64k\").\n * `acodec`: Audio codec (e.g., \"libmp3lame\").\n* **Enhancement (`enhance`)**:\n * `enabled`: Toggle audio enhancement.\n * `model`: Select enhancement model.\n * Additional parameters like `nfe`, `solver`, `lambd`, `tau` for fine-tuning.\n* **Inference (`infer`)**:\n * `batch_size`, `spliter_threshold`, `eos` for text processing.\n * `seed`: For reproducible outputs.\n * `stream`, `stream_chunk_size`: Enable and configure streaming output.\n * `no_cache`, `sync_gen`: Control caching and generation mode.\n* **Text Normalization (`tn`)**:\n * `enabled`/`disabled`: Specify text normalization rules.\n* **TTS Model Parameters (`tts`)**:\n * `mid`: TTS model ID (e.g., 
\"cosy-voice\").\n * `style`, `emotion`: Control speaking style and emotion.\n * `temperature`, `top_p`, `top_k`: Adjust sampling parameters for generation.\n * Deprecated: `prompt`, `prompt1`, `prompt2`, `prefix`.\n* **Voice Cloning (`vc`)** (Note: Potentially deprecated, mainly for OpenVoice):\n * `enabled`, `mid`, `emotion`, `tau`.\n\n**Response**:\n* **Success**: An audio file stream (`FileResponse`).\n* **Failure**: A JSON object detailing the error.\n\n**Key Considerations**:\n1. Ensure only one of `text`, `texts`, or `ssml` is provided.\n2. For voice cloning, use high-quality reference audio (3-5 seconds recommended).\n3. Enabling audio enhancement may increase generation latency.",
"operationId": "forge_text_synthesize_v2_tts_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/V2TtsParams"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response"
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/v2/stt": {
"post": {
"tags": [
"Forge V2"
],
"summary": "Transcribe V2",
"description": "Transcribes audio using base64 or URL input. Stateless interface.",
"operationId": "transcribe_v2_v2_stt_post",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TranscriptionRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TranscriptionsResponse"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
}
},
"components": {
"schemas": {
"AdjustConfig": {
"properties": {
"pitch": {
"type": "number",
"title": "Pitch",
"default": 0
},
"speed_rate": {
"type": "number",
"title": "Speed Rate",
"default": 1
},
"volume_gain_db": {
"type": "number",
"title": "Volume Gain Db",
"default": 0
},
"normalize": {
"type": "boolean",
"title": "Normalize",
"default": true
},
"headroom": {
"type": "number",
"title": "Headroom",
"default": 1
},
"remove_silence": {
"type": "boolean",
"title": "Remove Silence",
"default": false
},
"remove_silence_threshold": {
"type": "number",
"title": "Remove Silence Threshold",
"default": -42
}
},
"type": "object",
"title": "AdjustConfig"
},
"AudioConfig": {
"properties": {
"audioEncoding": {
"allOf": [
{
"$ref": "#/components/schemas/AudioFormat"
}
],
"default": "raw"
},
"audioBitrate": {
"type": "string",
"title": "Audio Bitrate",
"default": "64k"
},
"speakingRate": {
"type": "number",
"title": "Speaking Rate",
"default": 1
},
"pitch": {
"type": "number",
"title": "Pitch",
"default": 0
},
"volumeGainDb": {
"type": "number",
"title": "Volume Gain Db",
"default": 0
},
"sampleRateHertz": {
"type": "integer",
"title": "Sample Rate Hertz",
"default": 24000
},
"batchSize": {
"type": "integer",
"title": "Batch Size",
"default": 4
},
"spliterThreshold": {
"type": "integer",
"title": "Splitter Threshold",
"default": 100
}
},
"type": "object",
"title": "AudioConfig"
},
"AudioFormat": {
"type": "string",
"enum": [
"mp3",
"wav",
"ogg",
"acc",
"flac",
"raw"
],
"title": "AudioFormat"
},
"AudioReference": {
"properties": {
"wav_b64": {
"type": "string",
"title": "Wav B64"
},
"text": {
"type": "string",
"title": "Text"
},
"emotion": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Emotion",
"default": "default"
}
},
"type": "object",
"required": [
"wav_b64",
"text"
],
"title": "AudioReference"
},
"AudioSpeechParams": {
"properties": {
"input": {
"type": "string",
"title": "Input"
},
"model": {
"type": "string",
"title": "Model",
"default": "chat-tts"
},
"voice": {
"type": "string",
"title": "Voice",
"default": "female2"
},
"response_format": {
"allOf": [
{
"$ref": "#/components/schemas/AudioFormat"
}
],
"default": "raw"
},
"speed": {
"type": "number",
"maximum": 10.0,
"minimum": 0.1,
"title": "Speed",
"description": "Speed of the audio",
"default": 1
},
"seed": {
"type": "integer",
"title": "Seed",
"default": 42
},
"temperature": {
"type": "number",
"title": "Temperature",
"default": 0.3
},
"top_k": {
"type": "integer",
"title": "Top K",
"default": 20
},
"top_p": {
"type": "number",
"title": "Top P",
"default": 0.7
},
"style": {
"type": "string",
"title": "Style",
"default": ""
},
"batch_size": {
"type": "integer",
"maximum": 20.0,
"minimum": 1.0,
"title": "Batch Size",
"description": "Batch size",
"default": 1
},
"spliter_threshold": {
"type": "number",
"maximum": 1024.0,
"minimum": 10.0,
"title": "Splitter Threshold",
"description": "Threshold for sentence splitter",
"default": 100
},
"eos": {
"type": "string",
"title": "Eos",
"default": "[uv_break]"
},
"enhance": {
"type": "boolean",
"title": "Enhance",
"default": false
},
"denoise": {
"type": "boolean",
"title": "Denoise",
"default": false
},
"stream": {
"type": "boolean",
"title": "Stream",
"default": false
},
"bitrate": {
"type": "string",
"title": "Bitrate",
"default": "64k"
}
},
"type": "object",
"required": [
"input"
],
"title": "AudioSpeechParams"
},
"BaseResponse": {
"properties": {
"message": {
"type": "string",
"title": "Message"
},
"data": {
"title": "Data"
}
},
"type": "object",
"required": [
"message",
"data"
],
"title": "BaseResponse"
},
"Body_transcribe_stream_v1_stt_stream_post": {
"properties": {
"file": {
"type": "string",
"format": "binary",
"title": "File"
},
"model": {
"type": "string",
"title": "Model",
"default": "whisper.large"
},
"prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prompt"
},
"prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prefix"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Language"
},
"temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"title": "Temperature"
},
"sample_len": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Sample Len"
},
"best_of": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Best Of"
},
"beam_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Beam Size"
},
"patience": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Patience"
},
"length_penalty": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"title": "Length Penalty"
},
"format": {
"anyOf": [
{
"$ref": "#/components/schemas/STTOutputFormat"
},
{
"type": "null"
}
],
"default": "txt"
},
"highlight_words": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"title": "Highlight Words",
"default": false
},
"max_line_count": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Line Count"
},
"max_line_width": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Line Width"
},
"max_words_per_line": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Words Per Line"
}
},
"type": "object",
"required": [
"file"
],
"title": "Body_transcribe_stream_v1_stt_stream_post"
},
"Body_transcribe_v1_audio_transcriptions_post": {
"properties": {
"file": {
"type": "string",
"format": "binary",
"title": "File"
},
"model": {
"type": "string",
"title": "Model",
"default": "whisper.large"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Language"
},
"prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prompt"
},
"response_format": {
"type": "string",
"title": "Response Format",
"default": "txt"
},
"temperature": {
"type": "number",
"title": "Temperature",
"default": 0
},
"timestamp_granularities": {
"items": {
"type": "string"
},
"type": "array",
"title": "Timestamp Granularities",
"default": [
"segment"
]
}
},
"type": "object",
"required": [
"file"
],
"title": "Body_transcribe_v1_audio_transcriptions_post"
},
"Body_transcribe_v1_stt_transcribe_post": {
"properties": {
"file": {
"type": "string",
"format": "binary",
"title": "File"
},
"model": {
"type": "string",
"title": "Model",
"default": "whisper.large"
},
"prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prompt"
},
"prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prefix"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Language"
},
"temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"title": "Temperature"
},
"sample_len": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Sample Len"
},
"best_of": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Best Of"
},
"beam_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Beam Size"
},
"patience": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Patience"
},
"length_penalty": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"title": "Length Penalty"
},
"format": {
"anyOf": [
{
"$ref": "#/components/schemas/STTOutputFormat"
},
{
"type": "null"
}
],
"default": "txt"
},
"highlight_words": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"title": "Highlight Words",
"default": false
},
"max_line_count": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Line Count"
},
"max_line_width": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Line Width"
},
"max_words_per_line": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Max Words Per Line"
}
},
"type": "object",
"required": [
"file"
],
"title": "Body_transcribe_v1_stt_transcribe_post"
},
"Body_voice_clone_v1_vc_post": {
"properties": {
"src_audio": {
"type": "string",
"format": "binary",
"title": "Src Audio"
},
"ref_audio": {
"anyOf": [
{
"type": "string",
"format": "binary"
},
{
"type": "null"
}
],
"title": "Ref Audio"
}
},
"type": "object",
"required": [
"src_audio"
],
"title": "Body_voice_clone_v1_vc_post"
},
"CreateSpeaker": {
"properties": {
"name": {
"type": "string",
"title": "Name"
},
"gender": {
"type": "string",
"title": "Gender",
"default": ""
},
"author": {
"type": "string",
"title": "Author",
"default": ""
},
"desc": {
"type": "string",
"title": "Desc",
"default": ""
},
"version": {
"type": "string",
"title": "Version",
"default": ""
},
"wavs": {
"anyOf": [
{
"items": {
"$ref": "#/components/schemas/AudioReference"
},
"type": "array"
},
{
"type": "null"
}
],
"title": "Wavs"
},
"save_file": {
"type": "boolean",
"title": "Save File",
"default": false
}
},
"type": "object",
"required": [
"name"
],
"title": "CreateSpeaker"
},
"EncoderConfig": {
"properties": {
"format": {
"allOf": [
{
"$ref": "#/components/schemas/AudioFormat"
}
],
"default": "raw"
},
"bitrate": {
"type": "string",
"title": "Bitrate",
"default": "64k"
},
"acodec": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Acodec"
}
},
"type": "object",
"title": "EncoderConfig"
},
"EnhancerConfig": {
"properties": {
"enabled": {
"type": "boolean",
"title": "Enabled",
"default": false
},
"model": {
"type": "string",
"title": "Model",
"default": "resemble-enhance"
},
"nfe": {
"type": "integer",
"title": "Nfe",
"default": 32
},
"solver": {
"type": "string",
"enum": [
"midpoint",
"rk4",
"euler"
],
"title": "Solver",
"default": "midpoint"
},
"lambd": {
"type": "number",
"title": "Lambd",
"default": 0.5
},
"tau": {
"type": "number",
"title": "Tau",
"default": 0.5
}
},
"type": "object",
"title": "EnhancerConfig"
},
"FromUrl": {
"properties": {
"url": {
"type": "string",
"maxLength": 2083,
"minLength": 1,
"format": "uri",
"title": "Url",
"description": "音频文件 URL,必须是合法的 http(s) 地址"
},
"headers": {
"anyOf": [
{
"additionalProperties": {
"type": "string"
},
"type": "object"
},
{
"type": "null"
}
],
"title": "Headers",
"description": "请求 URL 时附带的自定义 header"
}
},
"type": "object",
"required": [
"url"
],
"title": "FromUrl"
},
"GoogleTextSynthesizeParams": {
"properties": {
"input": {
"$ref": "#/components/schemas/SynthesisInput"
},
"voice": {
"$ref": "#/components/schemas/VoiceSelectionParams"
},
"audioConfig": {
"$ref": "#/components/schemas/AudioConfig"
},
"enhancerConfig": {
"allOf": [
{
"$ref": "#/components/schemas/EnhancerConfig"
}
]
}
},
"type": "object",
"required": [
"input",
"voice",
"audioConfig"
],
"title": "GoogleTextSynthesizeParams"
},
"GoogleTextSynthesizeResponse": {
"properties": {
"audioContent": {
"type": "string",
"title": "Audio Content"
}
},
"type": "object",
"required": [
"audioContent"
],
"title": "GoogleTextSynthesizeResponse"
},
"HTTPValidationError": {
"properties": {
"detail": {
"items": {
"$ref": "#/components/schemas/ValidationError"
},
"type": "array",
"title": "Detail"
}
},
"type": "object",
"title": "HTTPValidationError"
},
"InferConfig": {
"properties": {
"batch_size": {
"type": "integer",
"title": "Batch Size",
"default": 4
},
"spliter_threshold": {
"type": "integer",
"title": "Splitter Threshold",
"default": 100
},
"eos": {
"type": "string",
"title": "Eos",
"default": "。"
},
"seed": {
"type": "integer",
"title": "Seed",
"default": 42
},
"stream": {
"type": "boolean",
"title": "Stream",
"default": false
},
"stream_chunk_size": {
"type": "integer",
"title": "Stream Chunk Size",
"default": 64
},
"no_cache": {
"type": "boolean",
"title": "No Cache",
"default": false
},
"sync_gen": {
"type": "boolean",
"title": "Sync Gen",
"default": false
},
"timeout": {
"type": "integer",
"title": "Timeout",
"default": 900
}
},
"type": "object",
"title": "InferConfig"
},
"InputAudio": {
"properties": {
"from_url": {
"anyOf": [
{
"$ref": "#/components/schemas/FromUrl"
},
{
"type": "null"
}
],
"description": "从 URL 加载音频"
},
"from_base64": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "From Base64",
"description": "base64 编码的音频数据"
}
},
"type": "object",
"title": "InputAudio"
},
"RefineTextRequest": {
"properties": {
"text": {
"type": "string",
"title": "Text"
},
"prompt": {
"type": "string",
"title": "Prompt",
"default": "[oral_2][laugh_0][break_6]"
},
"seed": {
"type": "integer",
"title": "Seed",
"default": -1
},
"top_P": {
"type": "number",
"title": "Top P",
"default": 0.7
},
"top_K": {
"type": "integer",
"title": "Top K",
"default": 20
},
"temperature": {
"type": "number",
"title": "Temperature",
"default": 0.7
},
"repetition_penalty": {
"type": "number",
"title": "Repetition Penalty",
"default": 1.0
},
"max_new_token": {
"type": "integer",
"title": "Max New Token",
"default": 384
},
"spliter_threshold": {
"type": "integer",
"title": "Splitter Threshold",
"default": 300
},
"normalize": {
"type": "boolean",
"title": "Normalize",
"default": true
}
},
"type": "object",
"required": [
"text"
],
"title": "RefineTextRequest"
},
"SSMLParams": {
"properties": {
"ssml": {
"type": "string",
"title": "Ssml"
},
"format": {
"allOf": [
{
"$ref": "#/components/schemas/AudioFormat"
}
],
"default": "raw"
},
"batch_size": {
"type": "integer",
"title": "Batch Size",
"default": 4
},
"eos": {
"type": "string",
"title": "Eos",
"default": "[uv_break]"
},
"model": {
"type": "string",
"title": "Model",
"default": "chat-tts"
},
"spliter_thr": {
"type": "integer",
"title": "Splitter Thr",
"default": 100
},
"enhancer": {
"allOf": [
{
"$ref": "#/components/schemas/EnhancerConfig"
}
],
"default": {
"enabled": false,
"model": "resemble-enhance",
"nfe": 32,
"solver": "midpoint",
"lambd": 0.5,
"tau": 0.5
}
},
"adjuster": {
"allOf": [
{
"$ref": "#/components/schemas/AdjustConfig"
}
],
"default": {
"pitch": 0.0,
"speed_rate": 1.0,
"volume_gain_db": 0.0,
"normalize": true,
"headroom": 1.0,
"remove_silence": false,
"remove_silence_threshold": -42.0
}
},
"stream": {
"type": "boolean",
"title": "Stream",
"default": false
}
},
"type": "object",
"required": [
"ssml"
],
"title": "SSMLParams"
},
"STTOutputFormat": {
"type": "string",
"enum": [
"txt",
"vtt",
"srt",
"tsv",
"lrc",
"json"
],
"title": "STTOutputFormat"
},
"SpeakerConfig": {
"properties": {
"from_spk_id": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "From Spk Id"
},
"from_spk_name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "From Spk Name"
},
"from_ref": {
"anyOf": [
{
"$ref": "#/components/schemas/SpeakerReference"
},
{
"type": "null"
}
]
}
},
"type": "object",
"title": "SpeakerConfig",
"description": "任选其中一种形式指定 spk"
},
"SpeakerDetail": {
"properties": {
"id": {
"type": "string",
"title": "Id"
},
"with_emb": {
"type": "boolean",
"title": "With Emb",
"default": false
}
},
"type": "object",
"required": [
"id"
],
"title": "SpeakerDetail"
},
"SpeakerReference": {
"properties": {
"wav_b64": {
"type": "string",
"title": "Wav B64"
},
"text": {
"type": "string",
"title": "Text"
}
},
"type": "object",
"required": [
"wav_b64",
"text"
],
"title": "SpeakerReference"
},
"SpeakersUpdate": {
"properties": {
"speakers": {
"items": {
"type": "object"
},
"type": "array",
"title": "Speakers"
}
},
"type": "object",
"required": [
"speakers"
],
"title": "SpeakersUpdate"
},
"SubtitleSegment": {
"prefixItems": [
{
"type": "string",
"title": "Start"
},
{
"type": "string",
"title": "End"
},
{
"type": "string",
"title": "Text"
},
{
"items": {},
"type": "array",
"title": "Words"
},
{
"type": "number",
"title": "Start S"
},
{
"type": "number",
"title": "End S"
}
],
"type": "array",
"maxItems": 6,
"minItems": 6
},
"SynthesisInput": {
"properties": {
"text": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Text"
},
"ssml": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Ssml"
}
},
"type": "object",
"title": "SynthesisInput"
},
"SynthesisParams": {
"properties": {
"text": {
"type": "string",
"title": "Text"
},
"speaker_wav": {
"type": "string",
"title": "Speaker Wav"
},
"language": {
"type": "string",
"title": "Language",
"default": "cn"
}
},
"type": "object",
"required": [
"text",
"speaker_wav"
],
"title": "SynthesisParams"
},
"TNConfig": {
"properties": {
"enabled": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"title": "Enabled"
},
"disabled": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"title": "Disabled"
}
},
"type": "object",
"title": "TNConfig"
},
"TTSConfig": {
"properties": {
"mid": {
"type": "string",
"title": "Mid",
"default": "chat-tts"
},
"style": {
"type": "string",
"title": "Style",
"default": ""
},
"temperature": {
"type": "number",
"title": "Temperature",
"default": 0.3
},
"top_p": {
"type": "number",
"title": "Top P",
"default": 0.7
},
"top_k": {
"type": "integer",
"title": "Top K",
"default": 20
},
"prompt": {
"type": "string",
"title": "Prompt",
"default": ""
},
"prompt1": {
"type": "string",
"title": "Prompt1",
"default": ""
},
"prompt2": {
"type": "string",
"title": "Prompt2",
"default": ""
},
"prefix": {
"type": "string",
"title": "Prefix",
"default": ""
},
"emotion": {
"type": "string",
"title": "Emotion",
"default": ""
}
},
"type": "object",
"title": "TTSConfig"
},
"TTSSettingsRequest": {
"properties": {
"stream_chunk_size": {
"type": "integer",
"title": "Stream Chunk Size"
},
"temperature": {
"type": "number",
"title": "Temperature"
},
"speed": {
"type": "number",
"title": "Speed"
},
"length_penalty": {
"type": "number",
"title": "Length Penalty"
},
"repetition_penalty": {
"type": "number",
"title": "Repetition Penalty"
},
"top_p": {
"type": "number",
"title": "Top P"
},
"top_k": {
"type": "integer",
"title": "Top K"
},
"enable_text_splitting": {
"type": "boolean",
"title": "Enable Text Splitting"
},
"batch_size": {
"type": "integer",
"title": "Batch Size"
},
"eos": {
"type": "string",
"title": "Eos"
},
"infer_seed": {
"type": "integer",
"title": "Infer Seed"
},
"use_decoder": {
"type": "boolean",
"title": "Use Decoder"
},
"prompt1": {
"type": "string",
"title": "Prompt1"
},
"prompt2": {
"type": "string",
"title": "Prompt2"
},
"prefix": {
"type": "string",
"title": "Prefix"
},
"spliter_threshold": {
"type": "integer",
"title": "Splitter Threshold"
},
"style": {
"type": "string",
"title": "Style"
}
},
"type": "object",
"required": [
"stream_chunk_size",
"temperature",
"speed",
"length_penalty",
"repetition_penalty",
"top_p",
"top_k",
"enable_text_splitting"
],
"title": "TTSSettingsRequest"
},
"TextNormalizeRequest": {
"properties": {
"text": {
"type": "string",
"title": "Text"
},
"pipe_id": {
"type": "string",
"enum": [
"base",
"chat-tts",
"cosy-voice",
"fish-speech",
"f5-tts",
"index-tts",
"spark-tts"
],
"title": "Pipe Id",
"default": "base"
},
"config": {
"anyOf": [
{
"$ref": "#/components/schemas/TNConfig"
},
{
"type": "null"
}
]
}
},
"type": "object",
"required": [
"text"
],
"title": "TextNormalizeRequest"
},
"TranscribeResult": {
"properties": {
"text": {
"type": "string",
"title": "Text"
},
"segments": {
"items": {
"$ref": "#/components/schemas/SubtitleSegment"
},
"type": "array",
"title": "Segments"
},
"language": {
"type": "string",
"title": "Language"
}
},
"type": "object",
"required": [
"text",
"segments",
"language"
],
"title": "TranscribeResult"
},
"TranscriptionRequest": {
"properties": {
"input_audio": {
"$ref": "#/components/schemas/InputAudio"
},
"model": {
"type": "string",
"title": "Model",
"description": "ASR 模型名称,如 whisper.base、whisper.large",
"default": "whisper.large"
},
"refrence_transcript": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Reference Transcript",
"description": "参考文案"
},
"prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prompt",
"description": "提示词,用于引导模型"
},
"prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Prefix",
"description": "对话历史或固定开头"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Language",
"description": "语言代码,如 'en', 'zh' 等"
},
"temperature": {
"anyOf": [
{
"type": "number",
"maximum": 1.0,
"minimum": 0.0
},
{
"type": "null"
}
],
"title": "Temperature",
"description": "采样温度,控制多样性,范围 0~1"
},
"sample_len": {
"anyOf": [
{
"type": "integer",
"minimum": 1.0
},
{
"type": "null"
}
],
"title": "Sample Len",
"description": "采样长度,必须 >= 1"
},
"best_of": {
"anyOf": [
{
"type": "integer",
"minimum": 1.0
},
{
"type": "null"
}
],
"title": "Best Of",
"description": "在 temperature > 0 时采样 n 次取最佳结果"
},
"beam_size": {
"anyOf": [
{
"type": "integer",
"minimum": 1.0
},
{
"type": "null"
}
],
"title": "Beam Size",
"description": "beam search 的宽度,推荐 5~10"
},
"patience": {
"anyOf": [
{
"type": "number",
"minimum": 0.0
},
{
"type": "null"
}
],
"title": "Patience",
"description": "beam search 的 patience 参数,越大越宽松"
},
"length_penalty": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"title": "Length Penalty",
"description": "对生成文本长度的惩罚,负值鼓励短输出"
},
"format": {
"allOf": [
{
"$ref": "#/components/schemas/STTOutputFormat"
}
],
"description": "输出格式,如 txt, json, srt, vtt",
"default": "txt"
},
"highlight_words": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"title": "Highlight Words",
"description": "是否高亮每个识别的单词(如 JSON 格式)",
"default": false
},
"max_line_count": {
"anyOf": [
{
"type": "integer",
"exclusiveMinimum": 0.0
},
{
"type": "null"
}
],
"title": "Max Line Count",
"description": "最大行数限制(用于格式化输出)"
},
"max_line_width": {
"anyOf": [
{
"type": "integer",
"exclusiveMinimum": 0.0
},
{
"type": "null"
}
],
"title": "Max Line Width",
"description": "最大每行字符数(用于 txt 格式)"
},
"max_words_per_line": {
"anyOf": [
{
"type": "integer",
"exclusiveMinimum": 0.0
},
{
"type": "null"
}
],
"title": "Max Words Per Line",
"description": "每行最多几个词(用于格式化输出)"
}
},
"type": "object",
"required": [
"input_audio"
],
"title": "TranscriptionRequest"
},
"TranscriptionsResponse": {
"properties": {
"message": {
"type": "string",
"title": "Message"
},
"data": {
"$ref": "#/components/schemas/TranscribeResult"
}
},
"type": "object",
"required": [
"message",
"data"
],
"title": "TranscriptionsResponse"
},
"UpdateSpeaker": {
"properties": {
"json": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"title": "Json"
}
},
"type": "object",
"title": "UpdateSpeaker"
},
"V2TtsParams": {
"properties": {
"adjust": {
"anyOf": [
{
"$ref": "#/components/schemas/AdjustConfig"
},
{
"type": "null"
}
]
},
"encoder": {
"anyOf": [
{
"$ref": "#/components/schemas/EncoderConfig"
},
{
"type": "null"
}
]
},
"enhance": {
"anyOf": [
{
"$ref": "#/components/schemas/EnhancerConfig"
},
{
"type": "null"
}
]
},
"infer": {
"anyOf": [
{
"$ref": "#/components/schemas/InferConfig"
},
{
"type": "null"
}
]
},
"vc": {
"anyOf": [
{
"$ref": "#/components/schemas/VCConfig"
},
{
"type": "null"
}
]
},
"tn": {
"anyOf": [
{
"$ref": "#/components/schemas/TNConfig"
},
{
"type": "null"
}
]
},
"tts": {
"$ref": "#/components/schemas/TTSConfig"
},
"spk": {
"anyOf": [
{
"$ref": "#/components/schemas/SpeakerConfig"
},
{
"type": "null"
}
]
},
"text": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Text"
},
"texts": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"title": "Texts"
},
"ssml": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Ssml"
}
},
"type": "object",
"title": "V2TtsParams"
},
"VCConfig": {
"properties": {
"enabled": {
"type": "boolean",
"title": "Enabled",
"default": false
},
"mid": {
"type": "string",
"title": "Mid",
"default": "open-voice"
},
"emotion": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Emotion"
},
"tau": {
"type": "number",
"title": "Tau",
"default": 0.3
}
},
"type": "object",
"title": "VCConfig"
},
"ValidationError": {
"properties": {
"loc": {
"items": {
"anyOf": [
{
"type": "string"
},
{
"type": "integer"
}
]
},
"type": "array",
"title": "Location"
},
"msg": {
"type": "string",
"title": "Message"
},
"type": {
"type": "string",
"title": "Error Type"
}
},
"type": "object",
"required": [
"loc",
"msg",
"type"
],
"title": "ValidationError"
},
"VoiceSelectionParams": {
"properties": {
"languageCode": {
"type": "string",
"title": "Language Code",
"default": "ZH-CN"
},
"name": {
"type": "string",
"title": "Name",
"default": "female2"
},
"style": {
"type": "string",
"title": "Style",
"default": ""
},
"temperature": {
"type": "number",
"title": "Temperature",
"default": 0.3
},
"topP": {
"type": "number",
"title": "Top P",
"default": 0.7
},
"topK": {
"type": "integer",
"title": "Top K",
"default": 20
},
"seed": {
"type": "integer",
"title": "Seed",
"default": 42
},
"eos": {
"type": "string",
"title": "Eos",
"default": "[uv_break]"
},
"model": {
"type": "string",
"title": "Model",
"default": "chat-tts"
}
},
"type": "object",
"title": "VoiceSelectionParams"
}
}
},
"servers": [
{
"url": "http://127.0.0.1:7870"
},
{
"url": "http://0.0.0.0:7870"
},
{
"url": "http://localhost:7870"
}
]
}