# Arazzo workflow: render an image, caption it with a vision chat model, then
# narrate the caption as speech. Three OpenAPI source descriptions are chained:
# image generation -> chat completions -> audio generation.
#
# Conventions used in this file:
#   * Because more than one source description is declared, every step
#     references its operation as $sourceDescriptions.<name>.<operationId>.
#   * A runtime expression that is only PART of a string value is wrapped in
#     {} so that it is interpolated; a value that is a whole expression is
#     written bare.
arazzo: "1.0.1"
info:
  title: Hyperbolic Generate And Describe Image
  summary: Render an image with diffusion, then describe it with a vision LLM and narrate the caption.
  description: >-
    A full multimodal loop across three Hyperbolic endpoints. A diffusion
    model renders a base64 image from a prompt, a vision-capable chat model
    inspects that image (passed as a data URI) and writes a caption, and the
    caption is then narrated by the text-to-speech endpoint. Every step
    inlines its request and inline Authorization Bearer credential so the flow
    reads and runs without opening the OpenAPI sources.
  version: "1.0.0"

# OpenAPI documents that define the operations invoked by the steps below.
sourceDescriptions:
  - name: imageGenerationApi
    url: ../openapi/hyperbolic-image-generation-api-openapi.yml
    type: openapi
  - name: chatCompletionsApi
    url: ../openapi/hyperbolic-chat-completions-api-openapi.yml
    type: openapi
  - name: audioGenerationApi
    url: ../openapi/hyperbolic-audio-generation-api-openapi.yml
    type: openapi

workflows:
  - workflowId: generate-and-describe-image
    summary: Generate an image, caption it with a vision model, and narrate the caption.
    description: >-
      Renders an image from a prompt, feeds the returned base64 image as a
      data URI to a vision chat model for captioning, then converts the
      caption to speech audio.
    # Workflow inputs. `language` and `speed` are not listed under `required`;
    # NOTE(review): the narrate step forwards them unconditionally, so
    # presumably the audio API tolerates them being absent - verify.
    inputs:
      type: object
      required:
        - apiKey
        - imageModel
        - imagePrompt
        - visionModel
      properties:
        apiKey:
          type: string
          description: Hyperbolic API key passed as a Bearer token.
        imageModel:
          type: string
          description: Image model name (e.g. SDXL1.0-base).
        imagePrompt:
          type: string
          description: The text-to-image prompt to render.
        visionModel:
          type: string
          description: >-
            Vision-capable chat model id (e.g.
            meta-llama/Llama-3.2-90B-Vision-Instruct).
        language:
          type: string
          description: TTS language code for narration.
        speed:
          type: number
          description: Speech speed between 0.5 and 2.0.
    steps:
      # Step 1: request exactly one image (n: 1) and expose the first entry of
      # the response's `images` array as step outputs.
      - stepId: render
        description: >-
          Generate a single image from the supplied prompt and capture its
          base64 bytes.
        # Qualified with the source description name to avoid ambiguity
        # between the three declared OpenAPI documents.
        operationId: $sourceDescriptions.imageGenerationApi.generateImage
        parameters:
          - name: Authorization
            in: header
            # Braces are required for an expression embedded in a string;
            # without them the literal text "$inputs.apiKey" would be sent.
            value: "Bearer {$inputs.apiKey}"
        requestBody:
          contentType: application/json
          payload:
            model_name: $inputs.imageModel
            prompt: $inputs.imagePrompt
            n: 1
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          image: $response.body#/images/0/image
          seed: $response.body#/images/0/seed

      # Step 2: send a single user message holding a text instruction and the
      # rendered image, and expose the first choice's message content.
      - stepId: caption
        description: >-
          Pass the generated image to a vision model as a data URI and ask for
          a one sentence caption.
        operationId: $sourceDescriptions.chatCompletionsApi.createChatCompletion
        parameters:
          - name: Authorization
            in: header
            value: "Bearer {$inputs.apiKey}"
        requestBody:
          contentType: application/json
          payload:
            model: $inputs.visionModel
            messages:
              - role: user
                content:
                  - type: text
                    text: Write a single descriptive sentence captioning this image.
                  - type: image_url
                    image_url:
                      # The render output is spliced into a data URI, so the
                      # embedded expression is brace-wrapped.
                      # NOTE(review): assumes the rendered image is PNG -
                      # confirm against the image generation API.
                      url: "data:image/png;base64,{$steps.render.outputs.image}"
            max_tokens: 200
            temperature: 0.5
            stream: false
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          caption: $response.body#/choices/0/message/content

      # Step 3: submit the caption text for speech synthesis and expose the
      # response's `audio` and `duration` fields.
      - stepId: narrate
        description: >-
          Convert the generated caption to speech audio and capture the base64
          audio and its duration.
        operationId: $sourceDescriptions.audioGenerationApi.generateAudio
        parameters:
          - name: Authorization
            in: header
            value: "Bearer {$inputs.apiKey}"
        requestBody:
          contentType: application/json
          payload:
            text: $steps.caption.outputs.caption
            language: $inputs.language
            speed: $inputs.speed
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          audio: $response.body#/audio
          duration: $response.body#/duration
    # Workflow-level results: one artifact from each step.
    outputs:
      image: $steps.render.outputs.image
      caption: $steps.caption.outputs.caption
      audio: $steps.narrate.outputs.audio