arazzo: 1.0.1
info:
  title: Hyperbolic Generate And Describe Image
  summary: Render an image with diffusion, then describe it with a vision LLM and narrate the caption.
  description: >-
    A full multimodal loop across three Hyperbolic endpoints. A diffusion model
    renders a base64 image from a prompt, a vision-capable chat model inspects
    that image (passed as a data URI) and writes a caption, and the caption is
    then narrated by the text-to-speech endpoint. Every step inlines its request
    body and its Authorization Bearer credential so the flow reads and runs
    without opening the OpenAPI sources.
  version: 1.0.0
# OpenAPI documents that define the operations invoked by the workflow steps.
# Each entry's name is the handle used to refer to that document.
sourceDescriptions:
- name: imageGenerationApi
  type: openapi
  url: ../openapi/hyperbolic-image-generation-api-openapi.yml
- name: chatCompletionsApi
  type: openapi
  url: ../openapi/hyperbolic-chat-completions-api-openapi.yml
- name: audioGenerationApi
  type: openapi
  url: ../openapi/hyperbolic-audio-generation-api-openapi.yml
workflows:
# Single workflow with three sequential steps: render -> caption -> narrate.
- workflowId: generate-and-describe-image
  summary: Generate an image, caption it with a vision model, and narrate the caption.
  description: >-
    Renders an image from a prompt, feeds the returned base64 image as a data
    URI to a vision chat model for captioning, then converts the caption to
    speech audio.
  # Caller-supplied inputs, expressed as a JSON Schema object. apiKey,
  # imageModel, imagePrompt and visionModel are mandatory. language and speed
  # are optional, but the narrate step still places both in its request payload.
  inputs:
    type: object
    required:
    - apiKey
    - imageModel
    - imagePrompt
    - visionModel
    properties:
      apiKey:
        type: string
        description: Hyperbolic API key passed as a Bearer token.
      imageModel:
        type: string
        description: Image model name (e.g. SDXL1.0-base).
      imagePrompt:
        type: string
        description: The text-to-image prompt to render.
      visionModel:
        type: string
        description: >-
          Vision-capable chat model id (e.g.
          meta-llama/Llama-3.2-90B-Vision-Instruct).
      language:
        type: string
        description: TTS language code for narration.
      speed:
        type: number
        description: Speech speed between 0.5 and 2.0.
        # Enforce the documented range at input validation time instead of
        # leaving it as prose. NOTE(review): bounds assumed inclusive — confirm
        # against the audio generation API.
        minimum: 0.5
        maximum: 2.0
  steps:
  # Step 1: text-to-image. Sends the prompt and model name, requests one image,
  # and captures the first returned image and its seed.
  - stepId: render
    description: >-
      Generate a single image from the supplied prompt and capture its base64
      bytes.
    # Qualified with the source description name: Arazzo requires the
    # $sourceDescriptions.<name>.<operationId> form when more than one
    # non-Arazzo source description is declared.
    # NOTE(review): assumes generateImage is defined in imageGenerationApi —
    # confirm against the OpenAPI document.
    operationId: $sourceDescriptions.imageGenerationApi.generateImage
    parameters:
    - name: Authorization
      in: header
      # Curly braces mark a runtime expression embedded inside a larger string;
      # without them the text after "Bearer " is not an expression.
      value: "Bearer {$inputs.apiKey}"
    requestBody:
      contentType: application/json
      payload:
        model_name: $inputs.imageModel
        prompt: $inputs.imagePrompt
        n: 1
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # JSON Pointers into the response body: first element of the images array.
      image: $response.body#/images/0/image
      seed: $response.body#/images/0/seed
  # Step 2: image captioning. Sends one user message containing a text
  # instruction plus the rendered image as a data URI, and captures the first
  # choice's message content as the caption.
  - stepId: caption
    description: >-
      Pass the generated image to a vision model as a data URI and ask for a one
      sentence caption.
    # Qualified with the source description name: Arazzo requires the
    # $sourceDescriptions.<name>.<operationId> form when more than one
    # non-Arazzo source description is declared.
    # NOTE(review): assumes createChatCompletion is defined in
    # chatCompletionsApi — confirm against the OpenAPI document.
    operationId: $sourceDescriptions.chatCompletionsApi.createChatCompletion
    parameters:
    - name: Authorization
      in: header
      # Curly braces mark a runtime expression embedded inside a larger string.
      value: "Bearer {$inputs.apiKey}"
    requestBody:
      contentType: application/json
      payload:
        model: $inputs.visionModel
        messages:
        - role: user
          content:
          - type: text
            text: Write a single descriptive sentence captioning this image.
          - type: image_url
            image_url:
              # The expression is embedded in the data URI, so it must be
              # wrapped in braces to be substituted.
              # NOTE(review): the image/png media type assumes the render step
              # returns PNG bytes — confirm.
              url: "data:image/png;base64,{$steps.render.outputs.image}"
        max_tokens: 200
        temperature: 0.5
        stream: false
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # JSON Pointer into the response body: content of the first choice.
      caption: $response.body#/choices/0/message/content
  # Step 3: text-to-speech. Sends the caption from the previous step together
  # with the language and speed inputs, and captures the audio and duration
  # fields of the response.
  - stepId: narrate
    description: >-
      Convert the generated caption to speech audio and capture the base64 audio
      and its duration.
    # Qualified with the source description name: Arazzo requires the
    # $sourceDescriptions.<name>.<operationId> form when more than one
    # non-Arazzo source description is declared.
    # NOTE(review): assumes generateAudio is defined in audioGenerationApi —
    # confirm against the OpenAPI document.
    operationId: $sourceDescriptions.audioGenerationApi.generateAudio
    parameters:
    - name: Authorization
      in: header
      # Curly braces mark a runtime expression embedded inside a larger string.
      value: "Bearer {$inputs.apiKey}"
    requestBody:
      contentType: application/json
      payload:
        text: $steps.caption.outputs.caption
        # language and speed are not in the workflow's required inputs, yet
        # both are always referenced here.
        language: $inputs.language
        speed: $inputs.speed
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      audio: $response.body#/audio
      duration: $response.body#/duration
  # Workflow-level results, listed in pipeline order. seed and duration are
  # already captured by the render and narrate steps; exposing them here makes
  # them available to callers instead of discarding them.
  outputs:
    image: $steps.render.outputs.image
    seed: $steps.render.outputs.seed
    caption: $steps.caption.outputs.caption
    audio: $steps.narrate.outputs.audio
    duration: $steps.narrate.outputs.duration