---
# OpenAPI document for the REST surface of the NVIDIA speech NIMs.
# The specification version is quoted so it is always read as a string.
openapi: '3.1.0'

info:
  title: NVIDIA NIM Speech API
  # Folded scalar: the line breaks below are joined with single spaces when
  # the document is parsed, so only the wrapping differs from a single line.
  description: >
    NVIDIA Riva-powered speech NIMs. Exposes automatic speech
    recognition (Parakeet, Canary), neural machine translation, and
    text-to-speech (Magpie-TTS, FastPitch) through HTTP surfaces.
    (Production deployments typically use the gRPC adapter for
    streaming; this OpenAPI documents the REST surface exposed by the
    hosted endpoint.)
  # Date-style API version; quoted so YAML does not turn it into a date.
  version: '2026-05-25'
  contact:
    name: NVIDIA Developer Support
    url: 'https://forums.developer.nvidia.com/c/ai-data-science/nemo-llm-service/'
  license:
    name: NVIDIA AI Enterprise License
    url: 'https://www.nvidia.com/en-us/data-center/products/ai-enterprise/'
# Base URLs at which the API is served.
servers:
  - url: 'https://integrate.api.nvidia.com'
    description: NVIDIA-hosted NIM endpoint
  - url: 'http://localhost:8000'
    description: Self-hosted NIM container default

# Document-level security requirement; the scheme name refers to an entry
# under components.securitySchemes, and the empty list means no scopes.
security:
  - BearerAuth: []

# Tags that operations use to group themselves.
tags:
  - name: ASR
    description: Automatic speech recognition (speech-to-text)
  - name: TTS
    description: Text-to-speech synthesis
paths:
  # Speech-to-text: upload one audio file as multipart form data and receive
  # its transcription in the format selected by `response_format`.
  /v1/audio/transcriptions:
    post:
      summary: Transcribe Audio
      description: Transcribe a WAV/FLAC/MP3 audio clip into text using a Riva ASR NIM (e.g. Parakeet, Canary).
      operationId: createTranscription
      tags:
        - ASR
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              required: [file, model]
              properties:
                file:
                  type: string
                  format: binary
                  description: Audio clip to transcribe.
                model:
                  type: string
                  description: Identifier of the ASR model to run.
                  example: nvidia/parakeet-ctc-1.1b-asr
                # NOTE(review): presumably the spoken language of the audio;
                # the accepted tag format is not stated here - confirm.
                language:
                  type: string
                  default: en-US
                response_format:
                  type: string
                  description: Format of the transcription returned in the response body.
                  enum: [json, text, srt, vtt]
                  default: json
                # NOTE(review): no range or default is declared for this
                # field; confirm the accepted values against the service.
                temperature:
                  type: number
      responses:
        '200':
          description: Transcription result.
          content:
            # Returned for `response_format: json` (the default).
            application/json:
              schema:
                type: object
                properties:
                  text:
                    type: string
                  language:
                    type: string
                  duration:
                    type: number
            # Covers the non-JSON formats (`text`, `srt`, `vtt`), which were
            # previously accepted by the request schema but had no documented
            # response body.
            # NOTE(review): assumes the server labels all three as text/plain;
            # confirm whether it uses text/vtt or application/x-subrip instead.
            text/plain:
              schema:
                type: string
        '400':
          description: Invalid audio or model.
        '401':
          description: Missing or invalid API key.
  # Text-to-speech: send text as JSON and receive the synthesized audio as a
  # binary body in the container selected by `response_format`.
  /v1/audio/speech:
    post:
      summary: Synthesize Speech
      description: Generate WAV/MP3 audio from text using a Riva TTS NIM (e.g. Magpie-TTS, FastPitch).
      operationId: createSpeech
      tags:
        - TTS
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [model, input, voice]
              properties:
                model:
                  type: string
                  description: Identifier of the TTS model to run.
                  example: nvidia/magpie-tts
                input:
                  type: string
                  description: Text to synthesize.
                voice:
                  type: string
                  description: Name of the voice to synthesize with.
                  example: en-US.Female-1
                response_format:
                  type: string
                  description: Audio format of the response body.
                  enum: [mp3, wav, opus, flac]
                  default: mp3
                # NOTE(review): presumably a playback-rate multiplier where
                # 1.0 is normal speed; no bounds are declared - confirm.
                speed:
                  type: number
                  default: 1.0
      responses:
        '200':
          description: Synthesized audio bytes.
          # One media type per value of the request's `response_format` enum.
          content:
            # response_format: mp3 (the default)
            audio/mpeg:
              schema:
                type: string
                format: binary
            # response_format: wav
            audio/wav:
              schema:
                type: string
                format: binary
            # response_format: opus - previously accepted but undocumented.
            # NOTE(review): assumes Opus is delivered in an Ogg container;
            # confirm whether the server sends audio/ogg or audio/opus.
            audio/ogg:
              schema:
                type: string
                format: binary
            # response_format: flac - previously accepted but undocumented.
            audio/flac:
              schema:
                type: string
                format: binary
        '400':
          description: Invalid request.
        '401':
          description: Missing or invalid API key.
# Reusable definitions referenced by name from elsewhere in the document.
components:
  securitySchemes:
    # HTTP bearer-token authentication (Authorization: Bearer <token>).
    BearerAuth:
      type: http
      scheme: bearer
      # Informational hint about the token's shape; quoted so the trailing
      # dots are unmistakably part of a string.
      bearerFormat: 'nvapi-...'