# OpenAPI Specification version this document is written against.
# Quoted so the value is always read as a string.
openapi: '3.1.0'

# API-level metadata surfaced by documentation and client tooling.
info:
  title: Mistral AI Audio Transcription API
  # Quoted on purpose: a bare 1.0 would be parsed as a float.
  version: '1.0'
  description: >-
    Audio transcription API powered by Voxtral for converting speech to
    text. Supports speaker diarization, word-level timestamps, context
    biasing, and real-time streaming across 13 languages.
  termsOfService: https://mistral.ai/terms/
  contact:
    name: Mistral AI Support
    email: support@mistral.ai
    url: https://docs.mistral.ai/

# Link to the human-readable reference for this API.
externalDocs:
  url: https://docs.mistral.ai/api/endpoint/audio/transcriptions
  description: Mistral AI Audio Transcription API Documentation
# Base URL that every entry under `paths` is resolved against.
servers:
  - description: Mistral AI Production
    url: https://api.mistral.ai/v1

# Tags used to group operations in generated documentation.
tags:
  - description: Audio transcription operations
    name: Audio

# Document-level security requirement; it applies to each operation that
# does not declare a `security` list of its own.
security:
  - bearerAuth: []
# Operations exposed by the API. The single operation below declares no
# `security` of its own, so the document-level bearerAuth requirement
# applies to it.
paths:
  /audio/transcriptions:
    post:
      operationId: createTranscription
      summary: Mistral AI Create an audio transcription
      description: >-
        Transcribe audio to text using Voxtral. Supports multiple audio formats,
        speaker diarization, word-level timestamps, context biasing with custom
        vocabulary, and 13 languages.
      tags:
        - Audio
      # The audio file and its options are uploaded as multipart form fields.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/TranscriptionRequest'
      responses:
        '200':
          description: Transcription result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TranscriptionResponse'
            # TranscriptionRequest.response_format also accepts `text`, which
            # the JSON schema above cannot describe.
            # NOTE(review): assumes that format returns the bare transcript
            # as text/plain - confirm against the live API.
            text/plain:
              schema:
                type: string
        # Error responses carry no documented body in this specification.
        '400':
          description: Bad request
        '401':
          description: Unauthorized
        '429':
          description: Rate limit exceeded
components:
  # Authentication schemes; `bearerAuth` is the name used by the
  # document-level `security` requirement.
  securitySchemes:
    bearerAuth:
      description: Mistral AI API key passed as a Bearer token
      scheme: bearer
      type: http
  schemas:
    # Form fields of the multipart/form-data body sent to
    # POST /audio/transcriptions. Only `model` and `file` are mandatory.
    TranscriptionRequest:
      type: object
      required: [model, file]
      properties:
        model:
          description: The transcription model to use
          type: string
          examples: [mistral-audio-latest]
        file:
          description: >-
            The audio file to transcribe (WAV, MP3, FLAC, OGG, M4A,
            WEBM)
          type: string
          format: binary
        language:
          description: >-
            ISO 639-1 language code. If not specified, the language
            is auto-detected.
          type: string
          examples: [en, fr, de, es]
        prompt:
          description: >-
            Context or vocabulary hints to improve transcription
            accuracy. Useful for domain-specific terminology.
          type: string
        response_format:
          description: Output format for the transcription
          type: string
          enum: [json, text, verbose_json]
          default: json
        temperature:
          description: Sampling temperature for transcription
          type: number
          # Accepted range is the closed interval 0 to 1.
          minimum: 0
          maximum: 1
          default: 0
        timestamp_granularities:
          description: Granularity levels for timestamps
          type: array
          items:
            type: string
            enum: [word, segment]
        diarization:
          description: Whether to enable speaker diarization
          type: boolean
          default: false
    # application/json body of a successful (200) transcription response.
    # No property is marked as required.
    TranscriptionResponse:
      type: object
      properties:
        text:
          description: The full transcribed text
          type: string
        language:
          description: Detected or specified language code
          type: string
        duration:
          description: Duration of the audio in seconds
          type: number
        words:
          description: Word-level details when verbose_json format is used
          type: array
          items:
            $ref: '#/components/schemas/Word'
        segments:
          description: Segment-level details when verbose_json format is used
          type: array
          items:
            $ref: '#/components/schemas/Segment'
    # One entry of TranscriptionResponse.words: a single word and its timing.
    Word:
      type: object
      properties:
        word:
          description: The transcribed word
          type: string
        start:
          description: Start time in seconds
          type: number
        end:
          description: End time in seconds
          type: number
        speaker:
          description: Speaker ID if diarization is enabled
          # Nullable: either an integer or JSON null ('null' stays quoted so
          # it is the type name, not a YAML null).
          type: [integer, 'null']
    # One entry of TranscriptionResponse.segments: a span of transcribed
    # text and its timing.
    Segment:
      type: object
      properties:
        id:
          description: Segment index
          type: integer
        start:
          description: Start time in seconds
          type: number
        end:
          description: End time in seconds
          type: number
        text:
          description: Transcribed text for this segment
          type: string
        speaker:
          description: Speaker ID if diarization is enabled
          # Nullable: either an integer or JSON null ('null' stays quoted so
          # it is the type name, not a YAML null).
          type: [integer, 'null']