{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "#/components/schemas/AssistantSpeechWordProgressTiming",
  "title": "AssistantSpeechWordProgressTiming",
  "type": "object",
  "properties": {
    "type": {
      "type": "string",
      "description": "Discriminator for cursor-based word progress (e.g. Minimax subtitle data).",
      "enum": [
        "word-progress"
      ]
    },
    "wordsSpoken": {
      "type": "number",
      "description": "Number of words spoken so far in this turn."
    },
    "totalWords": {
      "type": "number",
      "description": "Total number of words sent to the TTS provider for this turn.\n\n**Important**: this value grows across events within a single turn because\nMinimax synthesizes audio incrementally as the LLM streams tokens. Treat\nit as \"best known total so far\" \u2014 it will stabilize once synthesis is\ncomplete.\n\nA value of `0` is a valid sentinel meaning \"not yet known\". This can occur\non the very first `assistant-speech` event of a turn if audio begins\nplaying before the TTS provider has confirmed word-count data. Clients\n**must** guard against divide-by-zero when computing a progress fraction:\n\n```ts\nconst pct = totalWords > 0 ? wordsSpoken / totalWords : 0;\n```"
    },
    "segment": {
      "type": "string",
      "description": "The text of the latest spoken segment (sentence or clause). Use this\nfor caption display \u2014 it corresponds to the chunk just confirmed by\nthe TTS provider, unlike `text` on the parent message which carries\nthe full turn text."
    },
    "segmentDurationMs": {
      "type": "number",
      "description": "Audio duration in milliseconds for the latest spoken segment. Pair\nwith `segment` to animate karaoke-style word reveals \u2014 divide the\nsegment text across this duration for approximate per-word timing."
    },
    "words": {
      "description": "Per-word timestamps for the latest spoken segment. Available when the\nTTS provider supports word-level timing (e.g. Minimax with\nsubtitle_type: \"word\"). Syllables from the provider are aggregated\ninto whole words with start/end times relative to the segment start.\n\nUse these for precise karaoke-style highlighting instead of\ninterpolating from segmentDurationMs.",
      "type": "array",
      "items": {
        "$ref": "#/components/schemas/AssistantSpeechWordTimestamp"
      }
    }
  },
  "required": [
    "type",
    "wordsSpoken",
    "totalWords"
  ]
}