# CyberVerse example configuration.
# Copy this file to cyberverse_config.yaml and edit the local copy:
#   cp infra/cyberverse_config.example.yaml cyberverse_config.yaml
#
# Keep secrets in .env. Values such as ${DASHSCOPE_API_KEY} are expanded from
# environment variables after .env is loaded.
#
# NOTE(review): the nesting below was rebuilt from a copy whose line breaks and
# indentation had been collapsed. Confirm the two spots marked NOTE(review)
# against the config loader before relying on them.

server:
  host: "0.0.0.0"
  http_port: 8080
  grpc_port: 50051
  cors_origins: ["*"]

livekit:
  url: "${LIVEKIT_URL}"
  api_key: "${LIVEKIT_API_KEY}"
  api_secret: "${LIVEKIT_API_SECRET}"

# Warm up inference components to reduce first-frame latency.
warmup:
  enabled: true
  distributed:
    enabled: true  # Enable warmup in multi-GPU mode.
    timeout_s: 30  # Timeout in seconds to prevent hangs.

inference:
  avatar:
    # Set to false for pure voice sessions. Cached idle videos are still
    # served, but realtime speaking video and new idle video generation stop.
    enabled: true
    # Select the avatar backend initialized by the inference process.
    default: "flash_head"
    runtime:
      cuda_visible_devices: 0  # Use 0,1 for two GPUs.
      world_size: 1  # Match the number of visible GPUs.
    flash_head:
      plugin_class: "inference.plugins.avatar.flash_head_plugin.FlashHeadAvatarPlugin"
      models_dir: "models"
      checkpoint_dir: "./checkpoints/SoulX-FlashHead-1_3B"
      wav2vec_dir: "./checkpoints/wav2vec2-base-960h"
      model_type: "pro"
      device: "cuda:0"
      seed: 9999
      compile_model: true
      compile_vae: true
      dist_worker_main_thread: true
      infer_params:
        frame_num: 33
        motion_frames_latent_num: 2
        tgt_fps: 20
        sample_rate: 16000
        sample_shift: 5
        color_correction_strength: 1.0
        cached_audio_duration: 8
        num_heads: 12
        height: 512
        width: 512
    live_act:
      plugin_class: "inference.plugins.avatar.live_act_plugin.LiveActAvatarPlugin"
      models_dir: "models/SoulX-LiveAct"
      ckpt_dir: "./checkpoints/LiveAct"
      wav2vec_dir: "./checkpoints/chinese-wav2vec2-base"
      seed: 42
      t5_cpu: false
      fp8_kv_cache: false
      offload_cache: false
      block_offload: false
      mean_memory: false
      compile_wan_model: false
      compile_vae_decode: false
      dist_worker_main_thread: true
      default_prompt: "一个人在说话"
      infer_params:
        size: "320*480"
        fps: 20
        audio_cfg: 1.0

  omni:
    # Real realtime omni model providers only.
    default: "qwen_omni"
    doubao:
      plugin_class: "inference.plugins.voice_llm.doubao_realtime.DoubaoRealtimePlugin"
      access_token: "${DOUBAO_ACCESS_TOKEN}"
      app_id: "${DOUBAO_APP_ID}"
      voice_type: "zh_female_default"
      # Avoid DialogAudioIdleTimeoutError before FlashHead's first chunk arrives.
      end_smooth_window_ms: 6000
    qwen_omni:
      plugin_class: "inference.plugins.voice_llm.qwen_omni_realtime.QwenOmniRealtimePlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3.5-omni-flash-realtime"
      voice: "Tina"
      input_sample_rate: 16000
      output_sample_rate: 24000
      vad_type: "semantic_vad"
      vad_threshold: 0.5
      vad_silence_duration_ms: 800

  persona:
    # PersonaAgent is the orchestration layer. It wraps a concrete omni
    # model provider and coordinates background tasks.
    persona:
      plugin_class: "inference.plugins.voice_llm.persona_agent.PersonaAgentPlugin"
      # Use a provider whose adapter exposes native hidden tool calls.
      # The qwen_omni adapter supports PersonaAgent task tools in this MVP.
      model_provider: "qwen_omni"
      # Defaults to data/tasks/langgraph_checkpoints.db when empty.
      checkpoint_db_path: ""
      llm:
        # PersonaAgent owns the local Supervisor LangGraph and sub-agent
        # runtime directly. Defaults to inference.llm.default; set this only
        # when background tasks should use a different text LLM.
        provider: "qwen"
      tools:
        zhihu:
          access_secret: "${ZHIHU_ACCESS_SECRET}"
          api_base: "https://developer.zhihu.com"
          timeout_seconds: 30
          zhida_model: "zhida-fast-1p5"
      # NOTE(review): placed at the plugin level; it may instead belong under
      # `tools` - confirm against the PersonaAgent config schema.
      max_agent_iterations: 100

  llm:
    # Used by standard mode and by PersonaAgent's local sub-agent runtime.
    # The global system prompt is owned by the Go orchestrator; do not
    # duplicate persona prompts here.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.llm.qwen_plugin.QwenLLMPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3.6-plus"
      temperature: 0.7
      extra_body:
        enable_thinking: false
    openai:
      plugin_class: "inference.plugins.llm.openai_plugin.OpenAILLMPlugin"
      api_key: "${OPENAI_API_KEY}"
      model: "gpt-4o"
      temperature: 0.7

  embedding:
    # Used by local character RAG indexes.
    default: "qwen"
    qwen:
      api_key: "${DASHSCOPE_API_KEY}"
      model: "text-embedding-v4"
    openai:
      api_key: "${OPENAI_API_KEY}"
      model: "text-embedding-3-small"

  tts:
    # Used by standard mode after the LLM response is generated.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.tts.qwen_tts_plugin.QwenTTSPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3-tts-flash-realtime"
      voice: "Momo"
      sample_rate: 24000
      target_sample_rate: 16000
    openai:
      plugin_class: "inference.plugins.tts.openai_tts_plugin.OpenAITTSPlugin"
      api_key: "${OPENAI_API_KEY}"
      model: "tts-1"
      voice: "nova"

  asr:
    # Used by standard mode to transcribe microphone input.
    default: "qwen"
    qwen:
      plugin_class: "inference.plugins.asr.qwen_asr_plugin.QwenASRPlugin"
      api_key: "${DASHSCOPE_API_KEY}"
      model: "qwen3-asr-flash-realtime"
      language: "auto"
      sample_rate: 16000
      vad_threshold: 0.8
      vad_silence_duration_ms: 600
    whisper:
      plugin_class: "inference.plugins.asr.whisper_plugin.WhisperASRPlugin"
      model_size: "base"
      language: "auto"
      device: "cpu"

session:
  max_concurrent: 4
  idle_timeout_s: 300
  max_duration_s: 3600

pipeline:
  default_mode: "omni"
  streaming_mode: "direct"  # "direct" = P2P WebRTC, "livekit" = LiveKit SFU.
  rag:
    enabled: true
    top_k: 5
    min_score: 0.25
    max_context_chars: 4500
    chunk_chars: 900
    chunk_overlap_chars: 120
  visual_input:
    enabled: true
    frame_interval_ms: 1000
    max_width: 1280
    max_height: 720
    jpeg_quality: 0.78
    max_frame_bytes: 524288
    ws_max_message_bytes: 1048576
    max_recent_frames: 2
    frame_ttl_ms: 10000
  turn_enabled: true
  turn_port: 8443  # TCP port for embedded TURN server, useful through SSH tunnels.
  turn_realm: "cyberverse"
  turn_username: "cyberverse"
  turn_password: "${TURN_PASSWORD}"
  # REQUIRED for remote Direct WebRTC: public hostname or IP (not 127.0.0.1).
  # Used in turn:HOST:PORT sent to browsers.
  ice_public_ip: ""
  # NOTE(review): `recording` is placed under `pipeline`; it may instead be a
  # top-level section - confirm against the config loader.
  recording:
    # Enables per-turn MP4, raw WAV, and transcript files.
    enabled: true
    # Used when a session has no character-specific recording directory.
    # Character sessions are stored under data/characters/.../sessions/...
    output_dir: "./recordings"
    # x264 quality for recorded MP4 files. Lower means higher quality/larger files.
    crf: 23

inference_grpc:
  addr: "localhost:50051"