# Document root: AsyncAPI specification version, document identifier, and the
# default content type for message payloads.
asyncapi: "2.6.0"
id: "http://assemblyai.com/asyncapi"
defaultContentType: "application/json"

# Human-readable metadata about the API described by this document.
info:
  title: "AssemblyAI Streaming Speech-to-Text API"
  version: "1.1.2"
  description: "AssemblyAI Streaming Speech-to-Text API"
  contact:
    name: "API Support"
    url: "https://www.assemblyai.com/docs/"
    email: "support@assemblyai.com"
  license:
    name: "MIT License"

# The single secure-WebSocket server. Two security requirements are listed:
# the API key scheme and the temporary token scheme (see securitySchemes).
servers:
  API:
    description: "AssemblyAI API"
    protocol: "wss"
    url: "wss://api.assemblyai.com"
    security:
      - ApiKey: []
      - Token: []

# Tags declared for this document, with a link to the streaming guide.
tags:
  - name: "streaming"
    description: "Streaming Speech-to-Text"
    externalDocs:
      url: "https://www.assemblyai.com/docs/speech-to-text/streaming"

channels:
  /v2/realtime/ws:
    # WebSocket handshake parameters: request headers and query-string options.
    bindings:
      ws:
        headers:
          type: object
          properties:
            # The header name must match the `ApiKey` security scheme, which
            # declares `in: header` / `name: Authorization`.
            Authorization:
              description: Authenticate using your AssemblyAI API key
              type: string
        query:
          type: object
          properties:
            sample_rate:
              description: The sample rate of the streamed audio
              type: integer
              examples:
                - 16000
            # Passed as a string: the value is a JSON-encoded array, not a
            # repeated query parameter.
            word_boost:
              description: |
                Add up to 2500 characters of custom vocabulary.
                The parameter value must be a JSON encoded array of strings.
              type: string
              examples:
                - '["foo","bar"]'
            encoding:
              description: The encoding of the audio data
              $ref: "#/components/schemas/AudioEncoding"
            # Query-string counterpart of the `Token` security scheme.
            token:
              description: |
                Authenticate using a [generated temporary token](https://www.assemblyai.com/docs/speech-to-text/streaming#authenticate-with-a-temporary-token)
              type: string
              format: password
              examples:
                - eyJhbGciOiJIUzI1...
            disable_partial_transcripts:
              summary: Disable partial transcripts
              description: Set to true to not receive partial transcripts. Defaults to false.
              type: boolean
              default: false
            enable_extra_session_information:
              summary: Enable extra session information
              description: Set to true to receive the SessionInformation message before the session ends. Defaults to false.
              type: boolean
              default: false
    # Messages a client may send over the socket once it is connected.
    publish:
      operationId: sendMessage
      description: Send messages to the WebSocket
      message:
        oneOf:
          - $ref: '#/components/messages/SendAudio'
          - $ref: '#/components/messages/TerminateSession'
          - $ref: '#/components/messages/ForceEndUtterance'
          - $ref: '#/components/messages/ConfigureEndUtteranceSilenceThreshold'

    # Messages a client may receive from the socket.
    subscribe:
      operationId: receiveMessage
      description: Receive messages from the WebSocket
      message:
        oneOf:
          - $ref: '#/components/messages/SessionBegins'
          - $ref: '#/components/messages/PartialTranscript'
          - $ref: '#/components/messages/FinalTranscript'
          - $ref: '#/components/messages/SessionInformation'
          - $ref: '#/components/messages/SessionTerminated'
          - $ref: '#/components/messages/RealtimeError'

    # Example session: connection query parameters followed by a short
    # exchange of published (client) and subscribed (server) messages.
    x-fern-examples:
      - query-parameters:
          sample_rate: 16000
          word_boost: '["foo","bar"]'
          encoding: pcm_s16le
          token: eyJhbGciOiJIUzI1...
          disable_partial_transcripts: true
          enable_extra_session_information: true
        messages:
          - messageId: SendAudio
            type: publish
            value: UklGRtjIAABXQVZFZ
          - messageId: PartialTranscript
            type: subscribe
            value:
              # `message_type` is listed as required by the PartialTranscript
              # schema, so the example must carry it.
              message_type: "PartialTranscript"
              audio_start: 0
              audio_end: 1500
              confidence: 0.987190506414702
              text: there is a house in new orleans
              words:
                - start: 0
                  end: 300
                  confidence: 1.0
                  text: there
              created: "2023-05-24T08:09:10.161850"
          - messageId: FinalTranscript
            type: subscribe
            value:
              # `message_type` is listed as required by the FinalTranscript
              # schema, so the example must carry it.
              message_type: "FinalTranscript"
              audio_start: 0
              audio_end: 1500
              confidence: 0.987190506414702
              text: there is a house in new orleans
              words:
                - start: 0
                  end: 300
                  confidence: 1.0
                  text: there
              created: "2023-05-24T08:09:10.161850"
              punctuated: true
              text_formatted: true
          - messageId: TerminateSession
            type: publish
            value:
              terminate_session: true
          - messageId: SessionTerminated
            type: subscribe
            value:
              message_type: "SessionTerminated"

components:
  # Message definitions referenced by the channel's publish/subscribe
  # operations. Each one points at a payload schema under components.schemas.
  messages:
    # --- Messages listed under the channel's `publish` operation ---
    SendAudio:
      messageId: sendAudio
      summary: Send audio
      payload:
        $ref: '#/components/schemas/AudioData'
    ForceEndUtterance:
      messageId: forceEndUtterance
      summary: Manually end an utterance
      payload:
        $ref: '#/components/schemas/ForceEndUtterance'
    ConfigureEndUtteranceSilenceThreshold:
      messageId: configureEndUtteranceSilenceThreshold
      summary: >-
        Configure the threshold for how long to wait before ending an
        utterance. Default is 700ms.
      payload:
        $ref: '#/components/schemas/ConfigureEndUtteranceSilenceThreshold'
    TerminateSession:
      messageId: terminateSession
      summary: Terminate session
      payload:
        $ref: '#/components/schemas/TerminateSession'

    # --- Messages listed under the channel's `subscribe` operation ---
    SessionBegins:
      messageId: SessionBegins
      summary: Session start
      payload:
        $ref: '#/components/schemas/SessionBegins'
    SessionInformation:
      messageId: SessionInformation
      summary: Information about the session
      description: |
        Information about the session that is concluding.
        This message is sent at the end of the session, before the SessionTerminated message.
      payload:
        $ref: '#/components/schemas/SessionInformation'
    SessionTerminated:
      messageId: SessionTerminated
      summary: Session terminated
      payload:
        $ref: '#/components/schemas/SessionTerminated'
    RealtimeError:
      messageId: RealtimeError
      summary: Error message
      payload:
        $ref: '#/components/schemas/RealtimeError'
    PartialTranscript:
      messageId: partialTranscript
      summary: >-
        As you send audio data to the API, the API immediately starts
        responding with Partial Transcript results.
      payload:
        $ref: '#/components/schemas/PartialTranscript'
    FinalTranscript:
      messageId: finalTranscript
      summary: Transcript text at the end of an utterance with punctuation and casing.
      description: >-
        After you've received your partial results, our model continues to
        analyze incoming audio and, when it detects the end of an "utterance"
        (usually a pause in speech), it'll finalize the results sent to you so
        far with higher accuracy, as well as add punctuation and casing to the
        transcription text.
      payload:
        $ref: '#/components/schemas/FinalTranscript'

  schemas:
    # Base object composed (via allOf) into SessionBegins, SessionInformation
    # and SessionTerminated; it contributes the required `message_type` field.
    RealtimeBaseMessage:
      x-fern-sdk-group-name: realtime
      type: object
      properties:
        message_type:
          description: Describes the type of the message
          $ref: '#/components/schemas/MessageType'
      required:
        - message_type
    # Union of the payloads listed by the channel's `subscribe` operation.
    RealtimeMessage:
      x-fern-sdk-group-name: realtime
      oneOf:
        - $ref: '#/components/schemas/SessionBegins'
        - $ref: '#/components/schemas/PartialTranscript'
        - $ref: '#/components/schemas/FinalTranscript'
        - $ref: '#/components/schemas/SessionInformation'
        - $ref: '#/components/schemas/SessionTerminated'
        - $ref: '#/components/schemas/RealtimeError'
    # Error payload: an object holding a single required `error` string.
    RealtimeError:
      x-fern-sdk-group-name: realtime
      summary: Error message
      type: object
      properties:
        error:
          type: string
      required:
        - error
      additionalProperties: false
      examples:
        - error: "Client sent audio too fast"
    # Allowed values of the `message_type` field on RealtimeBaseMessage.
    MessageType:
      x-fern-sdk-group-name: realtime
      type: string
      enum:
        - SessionBegins
        - PartialTranscript
        - FinalTranscript
        - SessionInformation
        - SessionTerminated
    # The two `message_type` values that identify transcript messages.
    RealtimeTranscriptType:
      x-fern-sdk-group-name: realtime
      type: string
      enum:
        - PartialTranscript
        - FinalTranscript
    # Either transcript payload; `message_type` tells the two variants apart.
    RealtimeTranscript:
      x-fern-sdk-group-name: realtime
      discriminator: message_type
      oneOf:
        - $ref: '#/components/schemas/PartialTranscript'
        - $ref: '#/components/schemas/FinalTranscript'
    # Session-start payload: the base message narrowed to the constant
    # `SessionBegins`, plus the session identifier and its expiry timestamp.
    SessionBegins:
      x-fern-sdk-group-name: realtime
      summary: Session start
      allOf:
        - $ref: '#/components/schemas/RealtimeBaseMessage'
        - type: object
          properties:
            message_type:
              description: Describes the type of the message
              type: string
              const: SessionBegins
            session_id:
              description: Unique identifier for the established session
              type: string
              format: uuid
            expires_at:
              description: Timestamp when this session will expire
              type: string
              # Date and time joined by `T`, optional fractional seconds,
              # no timezone suffix.
              pattern: '^(?:(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2}(?:\.\d+)?))$'
              x-fern-type: datetime
              x-ts-type: Date
          required:
            - message_type
            - session_id
            - expires_at
          additionalProperties: false
      examples:
        - session_id: "f14499a6-c399-4c30-b1eb-0a33af64b1d9"
          expires_at: "2023-11-04T16:51:38.316048"
          message_type: "SessionBegins"
    # End-of-session summary payload carrying the total audio duration.
    SessionInformation:
      x-fern-sdk-group-name: realtime
      summary: Information about the session
      description: |
        Information about the session that is concluding.
        This message is sent at the end of the session, before the SessionTerminated message.
      allOf:
        - $ref: '#/components/schemas/RealtimeBaseMessage'
        - type: object
          properties:
            message_type:
              description: Describes the type of the message
              type: string
              const: SessionInformation
            audio_duration_seconds:
              description: The total duration of the audio in seconds
              type: number
              format: float
          required:
            - message_type
            - audio_duration_seconds
          additionalProperties: false
      examples:
        - message_type: "SessionInformation"
          audio_duration_seconds: 232.192
    # Session-terminated payload: only the constant `message_type` field.
    SessionTerminated:
      x-fern-sdk-group-name: realtime
      summary: Session terminated
      allOf:
        - $ref: '#/components/schemas/RealtimeBaseMessage'
        - type: object
          properties:
            message_type:
              description: Describes the type of the message
              type: string
              const: SessionTerminated
          required:
            - message_type
          additionalProperties: false
      examples:
        - message_type: "SessionTerminated"
    # Fields shared by PartialTranscript and FinalTranscript.
    #
    # This schema and the extension subschemas below deliberately do NOT set
    # `additionalProperties: false`. Inside `allOf` every subschema validates
    # the whole instance on its own, so a closed base would reject
    # `message_type` and a closed extension would reject `audio_start`,
    # `text`, etc., making the composed schemas impossible to satisfy.
    RealtimeBaseTranscript:
      type: object
      x-fern-sdk-group-name: realtime
      required:
        - audio_start
        - audio_end
        - confidence
        - text
        - words
        - created
      properties:
        audio_start:
          description: Start time of audio sample relative to session start, in milliseconds
          type: integer
        audio_end:
          description: End time of audio sample relative to session start, in milliseconds
          type: integer
        confidence:
          description: The confidence score of the entire transcription, between 0 and 1
          type: number
          minimum: 0
          maximum: 1
          format: double
        # Worded neutrally because both partial and final transcripts
        # inherit this property.
        text:
          description: The transcript text for your audio
          type: string
        words:
          description: |
            An array of objects, with the information for each word in the transcription text.
            Includes the start and end time of the word in milliseconds, the confidence score of the word, and the text, which is the word itself.
          type: array
          items:
            $ref: "#/components/schemas/Word"
        created:
          description: The timestamp for the transcript
          type: string
          # Date and time joined by `T`, optional fractional seconds,
          # no timezone suffix.
          pattern: '^(?:(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2}(?:\.\d+)?))$'
          x-fern-type: datetime
          x-ts-type: Date
    # Partial transcript payload: the shared transcript fields plus the
    # constant `message_type` of `PartialTranscript`.
    PartialTranscript:
      summary: As you send audio data to the API, the API immediately starts responding with Partial Transcript results.
      x-fern-sdk-group-name: realtime
      allOf:
        - $ref: "#/components/schemas/RealtimeBaseTranscript"
        # Left open (no `additionalProperties: false`) so the properties
        # contributed by the base schema remain valid here.
        - type: object
          required:
            - message_type
          properties:
            message_type:
              description: Describes the type of message
              type: string
              const: PartialTranscript
      examples:
        - {
            message_type: "PartialTranscript",
            created: "2023-11-03T17:14:13.854523",
            audio_start: 5310,
            audio_end: 7540,
            confidence: 0.59751355353605,
            text: "how can i show",
            words:
              [
                {
                  start: 7140,
                  end: 7175,
                  confidence: 0.466229424909777,
                  text: "how",
                },
                {
                  start: 7220,
                  end: 7255,
                  confidence: 0.432059008767216,
                  text: "can",
                },
                {
                  start: 7300,
                  end: 7335,
                  confidence: 0.994432034661841,
                  text: "i",
                },
                {
                  start: 7380,
                  end: 7415,
                  confidence: 0.497333745805364,
                  text: "show",
                },
              ],
          }
    # Final transcript payload: the shared transcript fields, the constant
    # `message_type` of `FinalTranscript`, and two formatting flags.
    FinalTranscript:
      summary: Transcript text at the end of an utterance with punctuation and casing.
      x-fern-sdk-group-name: realtime
      allOf:
        - $ref: "#/components/schemas/RealtimeBaseTranscript"
        # Left open (no `additionalProperties: false`) so the properties
        # contributed by the base schema remain valid here.
        - type: object
          required:
            - message_type
            - punctuated
            - text_formatted
          properties:
            message_type:
              description: Describes the type of message
              type: string
              const: FinalTranscript
            punctuated:
              description: Whether the text is punctuated and cased
              type: boolean
            text_formatted:
              description: Whether the text is formatted, for example Dollar -> $
              type: boolean
      examples:
        - {
            message_type: "FinalTranscript",
            created: "2023-11-03T17:14:35.00332",
            audio_start: 5310,
            audio_end: 25580,
            confidence: 0.853363490052184,
            text: "How can I show these two slides? So that demonstrates that the guide Cap, which for most of the last 3 million years has been the size of the lower 48 States, has shrunk by 40%. But this understates the seriousness of this particular problem because it doesn't show the sickness of the ice, the Oregon of the beating heart of the global climate system. It expands in winter and contracts against summer. The next slide I show you will be a rapid fast forward of what happened over the last 25 years. The permanentized is marked in red, and as you see, it expands to the dark wood. That's an annual.",
            words:
              [
                {
                  start: 7140,
                  end: 7175,
                  confidence: 0.466229424909777,
                  text: "How",
                },
                {
                  start: 7220,
                  end: 7255,
                  confidence: 0.432059008767216,
                  text: "can",
                },
                { start: 7300, end: 7335, confidence: 0.99, text: "I" },
                {
                  start: 7380,
                  end: 7415,
                  confidence: 0.497333745805364,
                  text: "show",
                },
                {
                  start: 7660,
                  end: 7695,
                  confidence: 0.999622368415665,
                  text: "these",
                },
                { start: 7700, end: 7735, confidence: 1, text: "two" },
                {
                  start: 7780,
                  end: 7895,
                  confidence: 0.950111441145432,
                  text: "slides?",
                },
                {
                  start: 8060,
                  end: 8095,
                  confidence: 0.865500654768006,
                  text: "So",
                },
                {
                  start: 8100,
                  end: 8135,
                  confidence: 0.736370866587664,
                  text: "that",
                },
                {
                  start: 8220,
                  end: 8535,
                  confidence: 0.704626940209106,
                  text: "demonstrates",
                },
                {
                  start: 8580,
                  end: 8615,
                  confidence: 0.993622467290789,
                  text: "that",
                },
              ],
            punctuated: true,
            text_formatted: true,
          }
    # One word of a transcript: timing, confidence and text. Used as the item
    # type of the `words` array in RealtimeBaseTranscript.
    Word:
      x-fern-sdk-group-name: realtime
      type: object
      properties:
        start:
          description: Start time of the word in milliseconds
          type: integer
        end:
          description: End time of the word in milliseconds
          type: integer
        confidence:
          description: Confidence score of the word
          type: number
          format: double
          minimum: 0
          maximum: 1
        text:
          description: The word itself
          type: string
      required:
        - start
        - end
        - confidence
        - text
      additionalProperties: false
      examples:
        - start: 7140
          end: 7175
          confidence: 0.466229424909777
          text: "how"
        - start: 7220
          end: 7255
          confidence: 0.432059008767216
          text: "can"
        - start: 7300
          end: 7335
          confidence: 0.994432034661841
          text: "i"
        - start: 7380
          end: 7415
          confidence: 0.497333745805364
          text: "show"
        - start: 7140
          end: 7175
          confidence: 0.466229424909777
          text: "How"
        - start: 7220
          end: 7255
          confidence: 0.432059008767216
          text: "can"
        - start: 7300
          end: 7335
          confidence: 0.99
          text: "I"
    # Payload of the SendAudio message: a binary string.
    AudioData:
      x-fern-sdk-group-name: realtime
      x-ts-type: ArrayBufferLike
      description: Binary audio data
      type: string
      format: binary
    # Payload of the ForceEndUtterance message: one required boolean flag.
    ForceEndUtterance:
      x-fern-sdk-group-name: realtime
      description: Manually end an utterance
      type: object
      properties:
        force_end_utterance:
          description: A boolean value to communicate that you wish to force the end of the utterance
          type: boolean
      required:
        - force_end_utterance
      additionalProperties: false
      examples:
        - force_end_utterance: true
    # Payload of the ConfigureEndUtteranceSilenceThreshold message: a silence
    # duration in milliseconds, bounded to the range 0..20000.
    ConfigureEndUtteranceSilenceThreshold:
      x-fern-sdk-group-name: realtime
      description: >-
        Configure the threshold for how long to wait before ending an
        utterance. Default is 700ms.
      type: object
      properties:
        end_utterance_silence_threshold:
          description: The duration threshold in milliseconds
          type: integer
          minimum: 0
          maximum: 20000
      required:
        - end_utterance_silence_threshold
      additionalProperties: false
      examples:
        - end_utterance_silence_threshold: 300
    # Payload of the TerminateSession message: one required boolean flag.
    TerminateSession:
      x-fern-sdk-group-name: realtime
      summary: Terminate session
      type: object
      properties:
        terminate_session:
          description: Set to true to end your streaming session forever
          type: boolean
      required:
        - terminate_session
      additionalProperties: false
      examples:
        - terminate_session: true
    # Audio encodings accepted by the `encoding` query parameter. The
    # `x-fern-enum` entries give each value a description and casing variants.
    AudioEncoding:
      x-fern-sdk-group-name: realtime
      description: The encoding of the audio data
      type: string
      enum:
        - pcm_s16le
        - pcm_mulaw
      default: pcm_s16le
      x-fern-enum:
        pcm_s16le:
          description: PCM signed 16-bit little-endian
          casing:
            camel: pcmS16le
            snake: pcm_s16le
            pascal: PcmS16le
            screamingSnake: PCM_S16LE
        pcm_mulaw:
          description: PCM Mu-law
          casing:
            camel: pcmMulaw
            snake: pcm_mulaw
            pascal: PcmMulaw
            screamingSnake: PCM_MULAW
      examples:
        - pcm_s16le
        - pcm_mulaw

  # The two security schemes referenced by `servers.API.security`: an API key
  # in the `Authorization` header, or a temporary token in the `token` query
  # parameter.
  securitySchemes:
    ApiKey:
      type: httpApiKey
      in: header
      name: Authorization
      description: Authenticate using your AssemblyAI API key
    Token:
      type: httpApiKey
      in: query
      name: token
      description: 'Authenticate using a [generated temporary token](https://www.assemblyai.com/docs/speech-to-text/streaming#authenticate-with-a-temporary-token)'