openapi: 3.1.0
info:
  title: Synth Task App Contract
  version: 1.0.0
  description: |
    # Task App Contract

    Task Apps are HTTP services that evaluate prompts for Synth's MIPRO and
    GEPA optimizers. Implement this contract in any language.

    ## How It Works

    ```
    ┌─────────────────┐           ┌──────────────────┐
    │   MIPRO/GEPA    │   HTTP    │  Your Task App   │
    │   Optimizer     │  ──────>  │  (any language)  │
    │                 │           │                  │
    │  Proposes new   │           │  Evaluates the   │
    │  prompts        │  <──────  │  prompt, returns │
    │                 │   reward  │  reward          │
    └─────────────────┘           └──────────────────┘
    ```

    1. The optimizer generates candidate prompts
    2. It calls your `/rollout` endpoint with each prompt
    3. You evaluate the prompt against your data and return a reward
    4. The optimizer uses the rewards to find better prompts

    ## Required Endpoints

    - `GET /health` - Health check (unauthenticated OK)
    - `POST /rollout` - Evaluate a prompt (authenticated)

    ## Optional Endpoints

    - `GET /info` - Task metadata (authenticated)

    ## Quick Start

    1. Implement `/health` and `/rollout`
    2. Run locally: `./your-task-app --port 8001`
    3. Expose via tunnel:
       ```bash
       cloudflared tunnel --url http://localhost:8001
       # Returns a URL like: https://random-words.trycloudflare.com
       ```
    4. Start optimization (see below)

    ## Running the Optimizer

    **Option A - Via CLI (requires Python):**
    ```bash
    uv tool install synth-ai
    synth prompt-learning run \
      --task-app-url https://your-tunnel.trycloudflare.com \
      --task-app-api-key your-env-key \
      --algorithm mipro
    ```

    **Option B - Via API (no Python needed):**
    ```bash
    curl -X POST https://api.usesynth.ai/api/prompt-learning/online/jobs \
      -H "Authorization: Bearer $SYNTH_API_KEY" \
      -H "Content-Type: application/json" \
      -d '{
        "algorithm": "mipro",
        "config_body": {
          "prompt_learning": {
            "task_app_url": "https://your-tunnel.trycloudflare.com",
            "task_app_api_key": "your-env-key"
          }
        }
      }'
    ```

    ## Authentication

    The optimizer sends the `X-API-Key` header to `/rollout` (and `/info` if
    implemented). Match it against your `ENVIRONMENT_API_KEY` environment
    variable. `/health` MAY be unauthenticated for simpler "is the tunnel
    alive?" checks.

servers:
  - url: http://localhost:8001
    description: Local development

components:
  securitySchemes:
    ApiKeyAuth:
      type: apiKey
      in: header
      name: X-API-Key
      description: |
        API key for authentication. The optimizer sends this if
        `task_app_api_key` is configured. Match against your
        `ENVIRONMENT_API_KEY` environment variable.

  schemas:
    # =========================================================================
    # REQUEST SCHEMAS
    # =========================================================================

    RolloutRequest:
      type: object
      description: |
        Request from the optimizer to evaluate a prompt.

        ## What Your Service Should Do

        1. Extract `seed` from `env.seed` (or `env.config.seed`)
        2. Load your dataset sample at that index: `sample = dataset[seed % len(dataset)]`
        3. Get the prompt from `policy.config.prompt_template.sections`
        4. Sort the sections by their `order` field
        5. Render each section: replace `{placeholders}` with your sample data
        6. POST to `policy.config.inference_url` + `/chat/completions`
        7. Parse the LLM response (tool_calls or content)
        8. Compare the prediction to your ground truth
        9. Return a RolloutResponse with the reward in `metrics.mean_return`
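
        Steps 1-5 and 7, sketched in Python (framework-agnostic; `DATASET`
        and `build_messages` are illustrative names, not part of the
        contract):

        ```python
        # Stand-in for your own data; one sample per seed index.
        DATASET = [
            {"query": "How do I reset my PIN?", "label": "change_pin"},
        ]

        def build_messages(request: dict) -> tuple[dict, list[dict]]:
            env = request.get("env") or {}
            seed = env.get("seed")
            if seed is None:  # fall back to env.config.seed
                seed = (env.get("config") or {}).get("seed", 0)
            sample = DATASET[seed % len(DATASET)]

            config = (request.get("policy") or {}).get("config") or {}
            template = config.get("prompt_template") or {}
            # The Python SDK may serialize `sections` as `prompt_sections`.
            sections = template.get("prompt_sections") or template.get("sections") or []

            messages = []
            for section in sorted(sections, key=lambda s: s.get("order", 0)):
                text = section.get("content") or section.get("pattern") or ""
                for key, value in sample.items():
                    # Replace known {placeholders}; unknown ones are left as-is.
                    text = text.replace("{" + key + "}", str(value))
                messages.append({"role": section["role"], "content": text})
            return sample, messages
        ```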
      required:
        - run_id
        - env
        - policy
        - mode
      properties:
        run_id:
          type: string
          description: Unique ID for this rollout - echo it back in the response
        env:
          $ref: '#/components/schemas/RolloutEnvSpec'
        policy:
          $ref: '#/components/schemas/RolloutPolicySpec'
        ops:
          type: array
          items: {}
          default: []
          description: Task-specific operations (can be ignored for most tasks)
        record:
          $ref: '#/components/schemas/RolloutRecordConfig'
        on_done:
          type: string
          default: "reset"
          description: Action when the episode ends (usually "reset")
        safety:
          $ref: '#/components/schemas/RolloutSafetyConfig'
        training_session_id:
          type: string
          nullable: true
          description: ID of the training session (if applicable)
        synth_base_url:
          type: string
          nullable: true
          description: Base URL for Synth API callbacks
        mode:
          type: string
          enum:
            - rl
            - eval
          description: |
            Rollout mode. Task Apps MAY ignore this field.
            Synth uses `"eval"` for prompt optimization.
      example:
        run_id: "run_abc123"
        env:
          seed: 42
          config:
            split: "train"
        policy:
          policy_id: "policy_1"
          config:
            model: "gpt-4o-mini"
            inference_url: "https://api.usesynth.ai/v1/trial-xyz"
            temperature: 0.0
            max_completion_tokens: 512
            prompt_template:
              sections:
                - role: "system"
                  content: "You are a banking intent classifier."
                  order: 0
                - role: "user"
                  pattern: "Customer query: {query}\n\nClassify into one of: {intents}"
                  order: 1
        mode: "eval"

    RolloutEnvSpec:
      type: object
      description: Environment/task specification
      properties:
        env_id:
          type: string
          nullable: true
          description: Environment identifier (optional)
        env_name:
          type: string
          nullable: true
          description: Human-readable environment name (optional)
        config:
          type: object
          additionalProperties: true
          default: {}
          description: |
            Task-specific config. Common fields:
            - `seed`: alternative location for the dataset index
            - `split`: "train" or "test"
        seed:
          type: integer
          nullable: true
          description: |
            INDEX into your dataset. Load the sample at this position.
            Use modulo: `index = seed % len(dataset)`

    RolloutPolicySpec:
      type: object
      description: Policy (model + prompt) specification
      properties:
        policy_id:
          type: string
          nullable: true
          description: Unique identifier for this policy
        policy_name:
          type: string
          nullable: true
          description: Human-readable policy name
        config:
          $ref: '#/components/schemas/PolicyConfig'

    PolicyConfig:
      type: object
      additionalProperties: true
      description: |
        Model and prompt configuration.

        This is a flexible dictionary - the fields below are common ones, but
        task apps may receive additional fields. Use `additionalProperties`
        to handle unknown fields gracefully.
      properties:
        model:
          type: string
          description: Model identifier (e.g., "gpt-4o-mini", "claude-3-5-sonnet")
          example: "gpt-4o-mini"
        provider:
          type: string
          description: Provider name (informational only)
          example: "openai"
        inference_url:
          type: string
          format: uri
          description: |
            BASE URL for LLM inference (WITHOUT `/chat/completions`).

            This URL is provided by the optimizer. Route ALL LLM requests
            through this URL - it handles API key injection, logging, and
            cost tracking.

            **Always append `/chat/completions` when making requests:**
            ```
            POST {inference_url}/chat/completions
            Content-Type: application/json

            {
              "model": "gpt-4o-mini",
              "messages": [...],
              "tools": [...],
              "tool_choice": "required"
            }
            ```
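
            A request sketch in Python (standard library only; `call_llm` is
            an illustrative helper, and `messages`/`tools` come from your own
            rendering step):

            ```python
            import json
            import urllib.request

            def call_llm(config: dict, messages: list[dict], tools: list[dict]) -> dict:
                # inference_url / api_base / base_url are interchangeable BASE URLs.
                base = (config.get("inference_url")
                        or config.get("api_base")
                        or config.get("base_url"))
                payload = {
                    "model": config.get("model", "gpt-4o-mini"),
                    "messages": messages,
                    "tools": tools,
                    "tool_choice": config.get("tool_choice", "required"),
                    "temperature": config.get("temperature", 0.0),
                    "max_completion_tokens": (config.get("max_completion_tokens")
                                              or config.get("max_tokens", 512)),
                }
                req = urllib.request.Request(
                    base.rstrip("/") + "/chat/completions",  # always append the path
                    data=json.dumps(payload).encode("utf-8"),
                    headers={"Content-Type": "application/json"},
                )
                with urllib.request.urlopen(req) as resp:
                    return json.load(resp)
            ```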
          example: "https://api.usesynth.ai/v1/trial-xyz"
        api_base:
          type: string
          format: uri
          description: Alternative to inference_url (same semantics - BASE URL)
        base_url:
          type: string
          format: uri
          description: Alternative to inference_url (same semantics - BASE URL)
        temperature:
          type: number
          default: 0.0
          description: Sampling temperature for LLM calls
        max_completion_tokens:
          type: integer
          default: 512
          description: Maximum tokens in LLM response
        max_tokens:
          type: integer
          description: Alternative to max_completion_tokens (same semantics)
        prompt_template:
          $ref: '#/components/schemas/PromptTemplate'
        tools:
          type: array
          items:
            $ref: '#/components/schemas/ToolDefinition'
          description: |
            Tool definitions to include in LLM request.
            If not provided, use your task-specific tools.
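
            For example, a classification task app might define a tool like
            the following (a sketch; the `classify` name and its schema are
            illustrative, not part of the contract):

            ```python
            # OpenAI-compatible tool definition, passed in the `tools` array.
            CLASSIFY_TOOL = {
                "type": "function",
                "function": {
                    "name": "classify",
                    "description": "Classify the customer query into one intent.",
                    "parameters": {
                        "type": "object",
                        "properties": {"intent": {"type": "string"}},
                        "required": ["intent"],
                    },
                },
            }
            ```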
        tool_choice:
          oneOf:
            - type: string
              enum:
                - auto
                - required
                - none
            - type: object
              properties:
                type:
                  type: string
                  enum:
                    - function
                function:
                  type: object
                  properties:
                    name:
                      type: string
          description: Tool choice mode for LLM requests

    PromptTemplate:
      type: object
      additionalProperties: true
      description: |
        THE PROMPT BEING EVALUATED. Contains message sections to send to
        the LLM. Your job: render these with your sample data.

        **Field Naming:** The Python SDK serializes with `prompt_` prefixes
        (e.g., `prompt_template_id`, `prompt_sections`). Both conventions
        are supported - check for both when parsing:

        ```
        id = template.get("prompt_template_id") or template.get("id")
        sections = template.get("prompt_sections") or template.get("sections")
        ```
      properties:
        # Standard naming
        id:
          type: string
          description: Unique identifier for this prompt template
        name:
          type: string
          description: Human-readable name for this template
        sections:
          type: array
          items:
            $ref: '#/components/schemas/PromptSection'
          description: |
            Messages in the prompt. Sort by `order` before rendering.

            For each section, the `content` field contains the template
            string with `{placeholders}` to replace with your sample data.
            Example:

            ```
            content:   "Classify this query: {query}"
            your_data: {"query": "How do I reset my PIN?"}
            rendered:  "Classify this query: How do I reset my PIN?"
            ```
        variables:
          type: object
          additionalProperties:
            type: string
          description: Variable definitions ("required" or "optional")
        metadata:
          type: object
          additionalProperties: true
          description: Additional prompt metadata

        # Python SDK naming (alternative)
        prompt_template_id:
          type: string
          description: Alternative to `id` (Python SDK serialization)
        prompt_template_name:
          type: string
          description: Alternative to `name` (Python SDK serialization)
        prompt_sections:
          type: array
          items:
            $ref: '#/components/schemas/PromptSection'
          description: Alternative to `sections` (Python SDK serialization)
        prompt_variables:
          type: object
          additionalProperties:
            type: string
          description: Alternative to `variables` (Python SDK serialization)
        prompt_metadata:
          type: object
          additionalProperties: true
          description: Alternative to `metadata` (Python SDK serialization)

    PromptSection:
      type: object
      required:
        - role
      properties:
        name:
          type: string
          description: |
            Section name (e.g., "instruction", "examples", "constraints").
            Used by the Python SDK for identification. Optional for
            polyglot implementations.
        role:
          type: string
          enum:
            - system
            - user
            - assistant
          description: Message role (system, user, or assistant)
        content:
          type: string
          description: |
            Template string with `{placeholders}` for variable substitution.
            This is the primary field used by the Python SDK. Example:

            ```
            content: "Classify the banking intent: {query}"
            ```
        pattern:
          type: string
          description: |
            Alternative name for `content` used by some task apps.
            Template with `{placeholders}` to replace with your sample data.
            Check both `content` and `pattern` when parsing.
        order:
          type: integer
          default: 0
          description: Sort sections by this field before rendering into messages

    ToolDefinition:
      type: object
      description: OpenAI-compatible tool definition
      required:
        - type
        - function
      properties:
        type:
          type: string
          enum:
            - function
          default: "function"
        function:
          type: object
          required:
            - name
          properties:
            name:
              type: string
              description: Function name
            description:
              type: string
              description: What this function does
            parameters:
              type: object
              description: JSON Schema for function parameters
              additionalProperties: true
            strict:
              type: boolean
              default: false
              description: Enable strict mode for parameter validation

    RolloutRecordConfig:
      type: object
      description: Recording configuration for trajectories
      properties:
        trajectories:
          type: boolean
          default: true
          description: Whether to record full trajectories
        logprobs:
          type: boolean
          default: false
          description: Whether to record log probabilities
        value:
          type: boolean
          default: false
          description: Whether to record value estimates
        return_trace:
          type: boolean
          default: false
          description: Whether to return a detailed trace
        trace_format:
          type: string
          enum:
            - compact
            - full
            - structured
          default: compact
          description: Format for trace output

    RolloutSafetyConfig:
      type: object
      description: Safety limits for rollout execution
      properties:
        max_ops:
          type: integer
          default: 100000
          description: Maximum operations allowed
        max_time_s:
          type: number
          default: 3600.0
          description: Maximum execution time in seconds

    # =========================================================================
    # RESPONSE SCHEMAS
    # =========================================================================

    RolloutResponse:
      type: object
      description: |
        Your response after evaluating the prompt.

        THE KEY FIELD IS `metrics.mean_return` - this is the reward that
        guides optimization. Higher = better prompt.
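
        A minimal single-step response, sketched as a Python dict (the
        `make_response` name and the `sample`/`predicted` inputs are
        illustrative):

        ```python
        def make_response(request: dict, sample: dict, predicted: str) -> dict:
            reward = 1.0 if predicted == sample["label"] else 0.0
            seed = (request.get("env") or {}).get("seed", 0)
            return {
                "run_id": request["run_id"],  # echo back
                "trajectories": [{
                    "env_id": f"my_task::train::{seed}",
                    "policy_id": request["policy"].get("policy_id") or "policy",
                    "steps": [{
                        "obs": sample,
                        "tool_calls": [],
                        "reward": reward,
                        "done": True,
                        "info": {
                            "expected": sample["label"],
                            "predicted": predicted,
                            "correct": reward == 1.0,
                        },
                    }],
                    "length": 1,
                    "inference_url": request["policy"]["config"]["inference_url"],
                }],
                "metrics": {
                    "episode_returns": [reward],
                    "mean_return": reward,  # the optimization target
                    "num_steps": 1,
                    "num_episodes": 1,
                },
            }
        ```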
      required:
        - run_id
        - trajectories
        - metrics
      properties:
        run_id:
          type: string
          description: Echo the run_id from the request
        trajectories:
          type: array
          minItems: 1
          items:
            $ref: '#/components/schemas/RolloutTrajectory'
          description: |
            Execution trace. Must contain at least one trajectory with at
            least one step that includes a reward.
        branches:
          type: object
          additionalProperties:
            type: array
            items:
              type: string
          default: {}
          description: Branch information (for multi-branch tasks)
        metrics:
          $ref: '#/components/schemas/RolloutMetrics'
        aborted:
          type: boolean
          default: false
          description: Whether the rollout was aborted early
        ops_executed:
          type: integer
          default: 0
          description: Number of operations executed
        trace_correlation_id:
          type: string
          nullable: true
          description: Correlation ID for distributed tracing
        trace:
          type: object
          nullable: true
          additionalProperties: true
          description: Optional detailed trace for debugging
        pipeline_metadata:
          type: object
          additionalProperties: true
          default: {}
          description: Pipeline-specific metadata
      example:
        run_id: "run_abc123"
        trajectories:
          - env_id: "banking77::train::42"
            policy_id: "policy_1"
            steps:
              - obs:
                  query: "How do I reset my PIN?"
                  index: 42
                tool_calls:
                  - id: "call_1"
                    type: "function"
                    function:
                      name: "classify"
                      arguments: '{"intent": "change_pin"}'
                reward: 1.0
                done: true
                info:
                  expected: "change_pin"
                  predicted: "change_pin"
                  correct: true
            length: 1
            inference_url: "https://api.usesynth.ai/v1/trial-xyz"
        metrics:
          episode_returns:
            - 1.0
          mean_return: 1.0
          num_steps: 1
          num_episodes: 1
          outcome_score: 1.0
        aborted: false
        ops_executed: 1

    RolloutTrajectory:
      type: object
      description: A single trajectory (episode) of execution
      required:
        - env_id
        - policy_id
        - steps
        - length
        - inference_url
      properties:
        env_id:
          type: string
          description: |
            Identifier for this task instance. Recommended format:
            `{task_name}::{split}::{seed}`
          example: "banking77::train::42"
        policy_id:
          type: string
          description: Echo the policy_id or policy_name from the request
        steps:
          type: array
          minItems: 1
          items:
            $ref: '#/components/schemas/RolloutStep'
          description: |
            Execution steps. Must contain at least one step with a reward.
        final:
          type: object
          additionalProperties: true
          nullable: true
          description: Final state information (optional)
        length:
          type: integer
          minimum: 1
          description: Number of steps in this trajectory
        inference_url:
          type: string
          description: The inference URL used for LLM calls
        decision_samples:
          type: array
          items: {}
          nullable: true
          description: Sampled decisions for analysis (optional)

    RolloutStep:
      type: object
      description: A single step in the trajectory
      required:
        - obs
        - tool_calls
        - done
      properties:
        obs:
          type: object
          additionalProperties: true
          description: |
            The input data for this step. Include your sample data here
            for debugging/logging purposes.
        tool_calls:
          type: array
          items:
            $ref: '#/components/schemas/ToolCall'
          description: |
            Tool calls from the LLM response. Can be an empty array if
            using content-based responses.
        reward:
          type: number
          nullable: true
          description: |
            THE REWARD FOR THIS STEP.

            For classification tasks:
            - 1.0 = prediction matches ground truth
            - 0.0 = prediction does not match

            For other tasks, use appropriate continuous scores.
            Can be null for intermediate steps in multi-step tasks.
        done:
          type: boolean
          description: Whether the episode is complete (usually true for single-step tasks)
        truncated:
          type: boolean
          nullable: true
          description: Whether the episode was truncated early
        info:
          type: object
          additionalProperties: true
          nullable: true
          description: |
            Debug info. Recommended fields:
            - expected: ground truth value
            - predicted: model's prediction
            - correct: boolean

    ToolCall:
      type: object
      description: |
        A tool call from the LLM response.
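
        Note that `function.arguments` is a JSON *string*, not an object.
        A parsing sketch (assuming a tool whose arguments carry an `intent`
        field, as in the examples above; the names are illustrative):

        ```python
        import json

        def extract_prediction(message: dict) -> str | None:
            """Pull the prediction from tool_calls, falling back to content."""
            for call in message.get("tool_calls") or []:
                args = json.loads(call["function"]["arguments"])
                if "intent" in args:
                    return args["intent"]
            return message.get("content")
        ```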
      properties:
        id:
          type: string
          description: Unique identifier for this tool call
        type:
          type: string
          default: "function"
          description: Type of tool call (always "function")
        function:
          type: object
          properties:
            name:
              type: string
              description: Name of the function called
            arguments:
              type: string
              description: JSON string of function arguments

    RolloutMetrics:
      type: object
      description: Aggregated metrics for the rollout
      required:
        - episode_returns
        - mean_return
        - num_steps
      properties:
        episode_returns:
          type: array
          items:
            type: number
          description: List of rewards (usually `[reward]` for single-step tasks)
        mean_return:
          type: number
          description: |
            THE OPTIMIZATION TARGET. This is what MIPRO/GEPA maximizes.

            For classification:
            - 1.0 = prompt produced correct answer
            - 0.0 = prompt produced wrong answer

            For batch evaluation, this is the average reward.
        num_steps:
          type: integer
          description: Total number of steps across all trajectories
        num_episodes:
          type: integer
          default: 1
          description: Number of episodes (usually 1)
        outcome_score:
          type: number
          nullable: true
          description: Same as mean_return for most tasks
        events_score:
          type: number
          nullable: true
          description: Score based on events (for event-driven tasks)
        details:
          type: object
          additionalProperties: true
          description: Additional metric details

    # =========================================================================
    # INFO ENDPOINT SCHEMAS
    # =========================================================================

    TaskInfo:
      type: object
      additionalProperties: true
      description: |
        Task metadata returned by the `/info` endpoint.
        Matches the Python SDK's `TaskInfo` model structure.
      required:
        - task
        - environment
        - dataset
        - inference
      properties:
        task:
          $ref: '#/components/schemas/TaskDescriptor'
        environment:
          type: string
          description: Environment identifier (e.g., "banking77", "crafter")
        dataset:
          $ref: '#/components/schemas/DatasetInfo'
        rubric:
          $ref: '#/components/schemas/RubricInfo'
        inference:
          $ref: '#/components/schemas/InferenceInfo'
        limits:
          $ref: '#/components/schemas/LimitsInfo'
        task_metadata:
          type: object
          additionalProperties: true
          description: Task-specific extras (e.g., prompt version info, documentation links)

    TaskDescriptor:
      type: object
      additionalProperties: true
      description: Human-readable task identifiers
      required:
        - id
        - name
      properties:
        id:
          type: string
          description: Unique task identifier
        name:
          type: string
          description: Human-readable task name
        description:
          type: string
          nullable: true
          description: Task description
        version:
          type: string
          nullable: true
          description: Task version

    DatasetInfo:
      type: object
      additionalProperties: true
      description: Metadata about the dataset powering the environment
      properties:
        id:
          type: string
          nullable: true
          description: Dataset identifier
        name:
          type: string
          nullable: true
          description: Human-readable dataset name
        version:
          type: string
          nullable: true
          description: Dataset version
        splits:
          type: array
          items:
            type: string
          nullable: true
          description: Available data splits (e.g., ["train", "test"])
        default_split:
          type: string
          nullable: true
          description: Default split to use
        description:
          type: string
          nullable: true
          description: Dataset description

    RubricInfo:
      type: object
      additionalProperties: true
      description: Outcome and event scoring definitions used by judges
      properties:
        outcome:
          $ref: '#/components/schemas/RubricSection'
        events:
          $ref: '#/components/schemas/RubricSection'

    RubricSection:
      type: object
      additionalProperties: true
      properties:
        name:
          type: string
          description: Section name
        criteria:
          type: array
          items:
            $ref: '#/components/schemas/RubricCriterion'
          default: []

    RubricCriterion:
      type: object
      additionalProperties: true
      required:
        - id
        - description
      properties:
        id:
          type: string
          description: Criterion identifier
        description:
          type: string
          description: Criterion description
        weight:
          type: number
          nullable: true
          description: Weight for scoring

    InferenceInfo:
      type: object
      additionalProperties: true
      description: Recommended defaults for policy model routing
      properties:
        model:
          type: string
          nullable: true
          description: Recommended model identifier
        inference_url:
          type: string
          nullable: true
          description: Recommended inference URL

    LimitsInfo:
      type: object
      additionalProperties: true
      description: Operational limits the environment enforces
      properties:
        max_turns:
          type: integer
          nullable: true
          description: Maximum turns per episode
        max_response_tokens:
          type: integer
          nullable: true
          description: Maximum response tokens
        timeout_seconds:
          type: integer
          nullable: true
          description: Timeout for each rollout in seconds

    # =========================================================================
    # COMMON SCHEMAS
    # =========================================================================

    HealthResponse:
      type: object
      description: |
        Health check response.
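
        A minimal handler sketch (assuming FastAPI; the framework choice is
        illustrative, and the `auth` block is optional):

        ```python
        import os
        from fastapi import FastAPI

        app = FastAPI()

        @app.get("/health")
        def health() -> dict:
            key = os.environ.get("ENVIRONMENT_API_KEY", "")
            return {
                "healthy": True,
                # Optional: help callers debug key mismatches.
                "auth": {"required": bool(key), "expected_prefix": key[:4]},
            }
        ```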
      required:
        - healthy
      properties:
        healthy:
          type: boolean
          description: Whether the service is healthy
        auth:
          type: object
          description: Authentication info (optional)
          properties:
            required:
              type: boolean
              description: Whether auth is required for /rollout
            expected_prefix:
              type: string
              description: First few chars of expected API key (for debugging)

    Error:
      type: object
      description: Error response
      required:
        - detail
      properties:
        detail:
          type: string
          description: Human-readable error message

paths:
  # ===========================================================================
  # HEALTH ENDPOINT (unauthenticated - for simple tunnel checks)
  # ===========================================================================
  /health:
    get:
      summary: Health check
      operationId: healthCheck
      description: |
        Basic health check. This endpoint MAY be unauthenticated to allow
        simple "is the tunnel alive?" checks. If you require auth, the
        optimizer will send `X-API-Key` here too.
      tags:
        - Health
      responses:
        '200':
          description: Service is healthy
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
              example:
                healthy: true
        '503':
          description: Service unavailable
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "Service is starting up"

  # ===========================================================================
  # INFO ENDPOINT (optional, authenticated)
  # ===========================================================================
  /info:
    get:
      summary: Task metadata (optional)
      operationId: getTaskInfo
      description: |
        Returns information about the task, dataset, and capabilities.
        This endpoint is OPTIONAL - implement it if you want the optimizer
        to discover your task's metadata.
      tags:
        - Info
      security:
        - ApiKeyAuth: []
      responses:
        '200':
          description: Task info
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskInfo'
              example:
                task:
                  id: "banking77"
                  name: "Banking77 Intent Classification"
                  description: "Classify banking customer queries into 77 intent categories"
                  version: "1.0.0"
                environment: "banking77"
                dataset:
                  id: "banking77"
                  name: "Banking77"
                  splits:
                    - "train"
                    - "test"
                  default_split: "train"
                rubric:
                  outcome:
                    name: "intent_accuracy"
                    criteria:
                      - id: "correct_intent"
                        description: "Correctly classify the customer query"
                        weight: 1.0
                inference:
                  model: "gpt-4o-mini"
                limits:
                  max_turns: 1
                  timeout_seconds: 30
                task_metadata:
                  format: "tool_call"
                  tool_name: "banking77_classify"
        '401':
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "Invalid or missing API key"

  # ===========================================================================
  # ROLLOUT ENDPOINT (required, authenticated)
  # ===========================================================================
  /rollout:
    post:
      summary: Evaluate a prompt
      operationId: executeRollout
      description: |
        THE CORE ENDPOINT. The optimizer calls this repeatedly with
        different prompts. You evaluate each prompt and return a reward.

        ## Implementation Checklist

        1. [ ] Parse the RolloutRequest
        2. [ ] Extract seed: `request.env.seed` or `request.env.config.seed`
        3. [ ] Load your sample: `dataset[seed % len(dataset)]`
        4. [ ] Get prompt sections: `request.policy.config.prompt_template.sections`
        5. [ ] Sort sections by `order` field
        6. [ ] Render each section: replace `{placeholders}` with your sample data
        7. [ ] Build messages array for LLM
        8. [ ] POST to `request.policy.config.inference_url` + `/chat/completions`
        9. [ ] Parse LLM response (tool_calls or content)
        10. [ ] Compare prediction to ground truth
        11. [ ] Return RolloutResponse with `metrics.mean_return` = reward
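
        A minimal end-to-end sketch of this checklist (assuming FastAPI and
        the illustrative helpers `build_messages`, `call_llm`,
        `extract_prediction`, `make_response`, and `CLASSIFY_TOOL` sketched
        in the schema descriptions above):

        ```python
        import os
        from fastapi import FastAPI, Header, HTTPException

        app = FastAPI()

        @app.post("/rollout")
        def rollout(request: dict,
                    x_api_key: str | None = Header(default=None)) -> dict:
            # Auth: match X-API-Key against ENVIRONMENT_API_KEY.
            expected = os.environ.get("ENVIRONMENT_API_KEY")
            if expected and x_api_key != expected:
                raise HTTPException(status_code=401,
                                    detail="Invalid or missing API key")

            sample, messages = build_messages(request)                # steps 1-7
            config = request["policy"]["config"]
            completion = call_llm(config, messages, [CLASSIFY_TOOL])  # step 8
            message = completion["choices"][0]["message"]
            predicted = extract_prediction(message)                   # steps 9-10
            return make_response(request, sample, predicted)          # step 11
        ```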
        ## Example: Classification Task

        Your dataset:
        ```json
        [
          {"query": "How do I reset my PIN?", "label": "change_pin"},
          {"query": "Card not arriving", "label": "card_arrival"},
          ...
        ]
        ```

        A request comes in with `seed=0` and a prompt_template whose
        sections are:
        - system: "You are a classifier..."
        - user (pattern): "Query: {query}\nClassify using the tool."

        You:
        1. Load `dataset[0]` = `{"query": "How do I reset my PIN?", "label": "change_pin"}`
        2. Render: "Query: How do I reset my PIN?\nClassify using the tool."
        3. POST to `{inference_url}/chat/completions` with the messages + your tool schema
        4. The LLM returns a tool_call with `{"intent": "change_pin"}`
        5. Compare: "change_pin" == "change_pin" -> correct!
        6. Return: `metrics.mean_return = 1.0`
      tags:
        - Rollout
      security:
        - ApiKeyAuth: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RolloutRequest'
      responses:
        '200':
          description: Rollout completed successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/RolloutResponse'
        '400':
          description: Bad request (invalid seed, missing fields, etc.)
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "Invalid seed: 99999 exceeds dataset size"
        '401':
          description: Unauthorized (missing or invalid API key)
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "Invalid or missing API key"
        '500':
          description: Internal server error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "Internal error: failed to load dataset"
        '502':
          description: Upstream LLM error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'
              example:
                detail: "LLM returned error: rate limit exceeded"

tags:
  - name: Health
    description: Health check endpoint
  - name: Info
    description: Task metadata endpoint (optional)
  - name: Rollout
    description: Core prompt evaluation endpoint