# OpenAPI description of the Eval Hub REST API.
openapi: '3.1.0'

# API identity, contact and licensing.
info:
  title: Eval Hub
  version: '0.4.4'
  description: API REST server for evaluation backend orchestration
  contact:
    name: API Support
  license:
    name: Apache 2.0
    url: 'https://www.apache.org/licenses/LICENSE-2.0'

# Base URLs the paths below are resolved against.
servers:
  - description: Local development server
    url: 'http://localhost:8080'

# Empty list: no security requirement is declared at the document level.
security: []

# Tag catalogue used to group operations.
tags:
  - name: Evaluations
    description: Evaluation job management endpoints
  - name: Collections
    description: Benchmark collection management endpoints
  - name: Providers
    description: Evaluation provider endpoints
  - name: Health
    description: Health check endpoints
paths:
  /api/v1/health:
    get:
      # Service health probe. The 200 examples show one `healthy` and one
      # `degraded` payload.
      tags:
        - Health
      operationId: get_health
      summary: Health Check
      description: Health check endpoint.
      responses:
        # Success: current service status.
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
              examples:
                healthy:
                  summary: Healthy service response
                  value:
                    status: healthy
                    # NOTE(review): `N.N.N` looks like a version placeholder - confirm.
                    version: N.N.N
                    timestamp: '2026-05-27T18:42:11Z'
                    git_hash: abc1234
                    # NOTE(review): presumably nanoseconds (24 h) - confirm unit.
                    uptime: 86400000000000
                degraded:
                  summary: Degraded service response
                  value:
                    status: degraded
                    version: N.N.N
                    timestamp: '2026-05-27T18:42:11Z'
                    git_hash: abc1234
                    uptime: 172800000000000
        # Error responses shared through `components/responses`.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/jobs:
    post:
      # Submits a new evaluation job. The request examples below cover four
      # variants: inline benchmarks, a benchmark collection, model auth with a
      # Kueue queue, and MLFlow experiment tracking.
      tags:
        - Evaluations
      summary: Create Evaluation
      description: Create and execute evaluation request using the simplified benchmark schema.
      operationId: post_evaluations_jobs
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EvaluationJobConfig'
            examples:
              # Variant 1: benchmarks listed inline, each with its own weight,
              # primary score and pass threshold.
              CreateEvaluationWithBenchmarks:
                summary: Evaluate a model with inline benchmarks
                value:
                  name: granite-3.1-8b-safety-eval
                  description: Safety and reasoning evaluation for Granite 3.1 8B Instruct
                  tags:
                    - nightly
                    - granite
                  model:
                    url: http://llm-service.models.svc.cluster.local:8000/v1
                    name: granite-3.1-8b-instruct
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      weight: 0.6
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
                      parameters:
                        num_fewshot: 0
                        limit: 100
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.4
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
                  # Top-level pass criteria, alongside the per-benchmark ones above.
                  pass_criteria:
                    threshold: 0.5
              # Variant 2: benchmarks come from a collection referenced by id
              # instead of an inline `benchmarks` list.
              CreateEvaluationWithCollection:
                summary: Evaluate a model using a benchmark collection
                value:
                  name: llama-3.2-standard-eval
                  description: Standard evaluation suite for Llama 3.2 3B
                  tags:
                    - release-candidate
                  model:
                    url: http://llm-service.models.svc.cluster.local:8000/v1
                    name: llama-3.2-3b
                  collection:
                    id: standard-safety-collection
                  pass_criteria:
                    threshold: 0.5
              # Variant 3: model credentials via `auth.secret_ref` plus a `queue`
              # block of kind `kueue`.
              CreateEvaluationWithAuthAndQueue:
                summary: Evaluate an authenticated model with Kueue scheduling
                value:
                  name: granite-3.1-8b-gpu-eval
                  description: GPU-scheduled evaluation with model authentication
                  tags:
                    - gpu
                    - kueue
                  model:
                    url: http://llm-service.models.svc.cluster.local:8000/v1
                    name: granite-3.1-8b-instruct
                    auth:
                      secret_ref: llm-api-key
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      parameters:
                        num_fewshot: 0
                        limit: 100
                  pass_criteria:
                    threshold: 0.5
                  queue:
                    kind: kueue
                    name: gpu-local-queue
              # Variant 4: adds an `experiment` block with a name and key/value tags.
              CreateEvaluationWithExperiment:
                summary: Evaluate with MLFlow experiment tracking
                value:
                  name: llama-3.2-nightly-eval
                  description: Nightly evaluation tracked in MLFlow
                  tags:
                    - nightly
                    - mlflow
                  model:
                    url: http://llm-service.models.svc.cluster.local:8000/v1
                    name: llama-3.2-3b
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      weight: 0.6
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      parameters:
                        num_fewshot: 0
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.4
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                  pass_criteria:
                    threshold: 0.5
                  experiment:
                    name: nightly-eval-experiment
                    tags:
                      - key: team
                        value: model-evaluation
                      - key: pipeline
                        value: nightly
      responses:
        # 202: the example shows the submitted job echoed back with server-side
        # `resource` metadata and a `pending` status.
        '202':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluationJobResource'
              examples:
                response:
                  summary: Evaluation job accepted and queued
                  value:
                    resource:
                      id: a1b2c3d4-5678-9abc-def0-1234567890ab
                      tenant: default
                      created_at: '2026-01-15T09:30:00Z'
                      updated_at: '2026-01-15T09:30:00Z'
                      owner: user@example.com
                    status:
                      state: pending
                      message:
                        message: Evaluation job created.
                        message_code: evaluation_job_created
                    name: granite-3.1-8b-safety-eval
                    description: Safety and reasoning evaluation for Granite 3.1 8B Instruct
                    tags:
                      - nightly
                      - granite
                    model:
                      url: http://llm-service.models.svc.cluster.local:8000/v1
                      name: granite-3.1-8b-instruct
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.6
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                        parameters:
                          num_fewshot: 0
                          limit: 100
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.4
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
                    pass_criteria:
                      threshold: 0.5
        # Error responses shared through `components/responses`.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    get:
      # Lists evaluation jobs with limit/offset pagination and optional filters.
      tags:
        - Evaluations
      operationId: get_evaluations_jobs
      summary: List Evaluations
      description: List all evaluation requests.
      parameters:
        # --- pagination ---
        - name: limit
          in: query
          description: Maximum number of evaluations to return
          required: false
          schema:
            title: Limit
            description: Maximum number of evaluations to return
            type: integer
            minimum: 1
            maximum: 100
            default: 50
        - name: offset
          in: query
          description: Offset for pagination
          required: false
          schema:
            title: Offset
            description: Offset for pagination
            type: integer
            minimum: 0
            default: 0
        # --- filters ---
        - name: status
          in: query
          description: Filter by status
          required: false
          schema:
            title: Status Filter
            description: Filter by status
            type: string
        - name: name
          in: query
          description: Name to search for
          required: false
          schema:
            title: Name
            type: string
        - name: tags
          in: query
          description: Tags to search for
          required: false
          schema:
            title: Tags
            type: string
      responses:
        # 200: one page of evaluation jobs plus pagination links.
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluationJobResourceList'
              examples:
                response:
                  summary: Paginated list of evaluation jobs
                  value:
                    # The example page uses limit 50 out of total_count 73, so a
                    # `next` link at offset 50 is present. Only one item is shown.
                    first:
                      href: /api/v1/evaluations/jobs?limit=50&offset=0
                    next:
                      href: /api/v1/evaluations/jobs?limit=50&offset=50
                    limit: 50
                    total_count: 73
                    items:
                      - resource:
                          id: a1b2c3d4-5678-9abc-def0-1234567890ab
                          tenant: default
                          created_at: '2026-01-15T09:30:00Z'
                          updated_at: '2026-01-15T09:42:15Z'
                          owner: user@example.com
                        status:
                          state: completed
                          message:
                            message: Evaluation job completed.
                            message_code: evaluation_job_updated
                        # Per-benchmark results followed by the job-level `test` outcome.
                        results:
                          benchmarks:
                            - id: arc_easy
                              provider_id: lm_evaluation_harness
                              benchmark_index: 0
                              metrics:
                                acc: 0.82
                                acc_norm: 0.85
                              test:
                                primary_score: 0.85
                                threshold: 0.25
                                pass: true
                          test:
                            score: 0.85
                            threshold: 0.5
                            pass: true
                        name: granite-3.1-8b-safety-eval
                        model:
                          url: http://llm-service.models.svc.cluster.local:8000/v1
                          name: granite-3.1-8b-instruct
                        benchmarks:
                          - id: arc_easy
                            provider_id: lm_evaluation_harness
                            weight: 0.6
                        pass_criteria:
                          threshold: 0.5
        # Error responses shared through `components/responses`.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/jobs/{id}:
    get:
      # Fetches a single evaluation job by its identifier.
      tags:
        - Evaluations
      summary: Get Evaluation
      description: Returns the evaluation job resource with the current status and results.
      operationId: get_evaluations_jobs_id
      parameters:
        # Parameter-level description added so rendered docs explain the path
        # segment; the schema itself is unchanged.
        - name: id
          in: path
          required: true
          description: Identifier of the evaluation job to retrieve
          schema:
            type: string
            title: Id
      responses:
        # 200: the job resource; the example shows a completed two-benchmark job.
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluationJobResource'
              examples:
                response:
                  summary: Completed evaluation job with benchmark results
                  value:
                    resource:
                      id: a1b2c3d4-5678-9abc-def0-1234567890ab
                      tenant: default
                      created_at: '2026-01-15T09:30:00Z'
                      updated_at: '2026-01-15T09:42:15Z'
                      owner: user@example.com
                    status:
                      state: completed
                      message:
                        message: Evaluation job completed.
                        message_code: evaluation_job_updated
                      # Per-benchmark execution status, keyed by `benchmark_index`.
                      benchmarks:
                        - provider_id: lm_evaluation_harness
                          id: arc_easy
                          benchmark_index: 0
                          status: completed
                          started_at: '2026-01-15T09:31:00Z'
                          completed_at: '2026-01-15T09:38:45Z'
                        - provider_id: garak
                          id: owasp_llm_top10
                          benchmark_index: 1
                          status: completed
                          started_at: '2026-01-15T09:31:00Z'
                          completed_at: '2026-01-15T09:42:15Z'
                    # Per-benchmark metrics and pass/fail, then the job-level outcome.
                    results:
                      benchmarks:
                        - id: arc_easy
                          provider_id: lm_evaluation_harness
                          benchmark_index: 0
                          metrics:
                            acc: 0.82
                            acc_norm: 0.85
                          mlflow_run_id: run-7f3a1b2c
                          logs_path: /data/logs/a1b2c3d4.log
                          test:
                            primary_score: 0.85
                            threshold: 0.25
                            pass: true
                        - id: owasp_llm_top10
                          provider_id: garak
                          benchmark_index: 1
                          metrics:
                            attack_success_rate: 0.12
                          mlflow_run_id: run-9e8d7c6b
                          logs_path: /data/logs/a1b2c3d4-garak.log
                          test:
                            primary_score: 0.12
                            threshold: 0.3
                            pass: true
                      # NOTE(review): the aggregate `score` equals the arc_easy
                      # primary score although two weighted benchmarks are listed -
                      # confirm this example against the real aggregation.
                      test:
                        score: 0.85
                        threshold: 0.5
                        pass: true
                    # Remaining fields echo the submitted job configuration.
                    name: granite-3.1-8b-safety-eval
                    description: Safety and reasoning evaluation for Granite 3.1 8B Instruct
                    tags:
                      - nightly
                      - granite
                    model:
                      url: http://llm-service.models.svc.cluster.local:8000/v1
                      name: granite-3.1-8b-instruct
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.6
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                        parameters:
                          num_fewshot: 0
                          limit: 100
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.4
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
                    pass_criteria:
                      threshold: 0.5
        # Error responses shared through `components/responses`.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      # Cancels an evaluation job; `hard_delete=true` removes it permanently.
      tags:
        - Evaluations
      summary: Cancel Evaluation
      description: |-
        Cancel a running evaluation.

        Set `hard_delete=true` to delete the evaluation job permanently, so that
        `GET /api/v1/evaluations/jobs/{id}` returns a 404 afterwards.
      operationId: delete_evaluations_jobs_id
      parameters:
        # Descriptions are given at parameter level as well as in the schema so
        # that documentation renderers reading the Parameter Object show them.
        - name: id
          in: path
          required: true
          description: Identifier of the evaluation job to cancel
          schema:
            type: string
            title: Id
        - name: hard_delete
          in: query
          required: false
          description: If `true`, delete the evaluation job permanently so that `GET /api/v1/evaluations/jobs/{id}` will return a 404.
          schema:
            type: boolean
            description: If `true`, delete the evaluation job permanently so that `GET /api/v1/evaluations/jobs/{id}` will return a 404.
            default: false
            title: Hard Delete
      responses:
        # 204: success, no response body.
        '204':
          description: Successful Response
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
        # NOTE(review): presumably returned when the job's current state does
        # not allow cancellation - confirm against the handler.
        '409':
          $ref: '#/components/responses/Conflict'
  /api/v1/evaluations/jobs/{id}/logs:
    get:
      # Plain-text logs for every benchmark of one evaluation job.
      tags:
        - Evaluations
      summary: Get Evaluation Job Logs
      description: |
        Returns plain-text workload logs for all benchmarks in an evaluation job.

        **Kubernetes runtime:** adapter container stdout/stderr via the Kubernetes API.
        **Local runtime:** contents of each benchmark's `jobrun.log` file under
        `/tmp/evalhub-jobs/{job_id}/{benchmark_index}/{provider_id}/{benchmark_id}/`.
        Logs are fetched on demand from the active runtime. Distinct from `logs_path` on
        benchmark results, which refers to adapter-written artifact files.
      operationId: get_evaluations_jobs_id_logs
      parameters:
        # Every parameter carries its description at parameter level as well as
        # in the schema, so renderers reading the Parameter Object show it.
        - name: id
          in: path
          required: true
          description: Identifier of the evaluation job whose logs are returned
          schema:
            type: string
            title: Id
        - name: tail_lines
          in: query
          required: false
          description: |
            Maximum number of log lines to return per benchmark. The response
            concatenates one section per benchmark; each section is capped
            independently, not as a total across the full response.
          schema:
            type: integer
            minimum: 1
            maximum: 10000
            default: 1000
            description: |
              Maximum number of log lines to return per benchmark. The response
              concatenates one section per benchmark; each section is capped
              independently, not as a total across the full response.
        - name: timestamps
          in: query
          required: false
          description: Include Kubernetes log timestamps
          schema:
            type: boolean
            default: false
            description: Include Kubernetes log timestamps
        - name: previous
          in: query
          required: false
          description: Return logs from the previous terminated container instance
          schema:
            type: boolean
            default: false
            description: Return logs from the previous terminated container instance
        - name: since_seconds
          in: query
          required: false
          description: Only return logs newer than this many seconds
          schema:
            type: integer
            minimum: 1
            description: Only return logs newer than this many seconds
      responses:
        # 200: `text/plain` body; the example shows one `=== ... ===` header
        # line followed by that benchmark's log lines.
        '200':
          description: Successful Response
          content:
            text/plain:
              schema:
                type: string
              examples:
                response:
                  summary: Plain-text logs for all benchmarks in a job
                  value: |
                    === pod=a1b2c3d4-405ef22a-abc12 container=adapter benchmark_id=arc_easy ===
                    INFO starting evaluation
                    INFO benchmark completed
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/jobs/{id}/benchmarks/{benchmark_index}/logs:
    get:
      # Plain-text logs for a single benchmark of one evaluation job.
      tags:
        - Evaluations
      summary: Get Evaluation Benchmark Logs
      description: |
        Returns plain-text workload logs for a single benchmark within an evaluation job.
        The benchmark is identified by `benchmark_index` in the request path.

        **Kubernetes runtime:** adapter container stdout/stderr via the Kubernetes API.
        **Local runtime:** contents of the benchmark's `jobrun.log` file. See
        `GET /api/v1/evaluations/jobs/{id}/logs` for shared query parameters.
      operationId: get_evaluations_jobs_id_benchmarks_benchmark_index_logs
      parameters:
        # Every parameter carries its description at parameter level as well as
        # in the schema, so renderers reading the Parameter Object show it.
        - name: id
          in: path
          required: true
          description: Identifier of the evaluation job that owns the benchmark
          schema:
            type: string
            title: Id
        - name: benchmark_index
          in: path
          required: true
          description: Index of the benchmark within the evaluation job (minimum 0)
          schema:
            type: integer
            minimum: 0
            title: Benchmark Index
        - name: tail_lines
          in: query
          required: false
          description: Maximum number of log lines to return
          schema:
            type: integer
            minimum: 1
            maximum: 10000
            default: 1000
            description: Maximum number of log lines to return
        - name: timestamps
          in: query
          required: false
          description: Include Kubernetes log timestamps
          schema:
            type: boolean
            default: false
            description: Include Kubernetes log timestamps
        - name: previous
          in: query
          required: false
          description: Return logs from the previous terminated container instance
          schema:
            type: boolean
            default: false
            description: Return logs from the previous terminated container instance
        - name: since_seconds
          in: query
          required: false
          description: Only return logs newer than this many seconds
          schema:
            type: integer
            minimum: 1
            description: Only return logs newer than this many seconds
      responses:
        # 200: `text/plain` body containing the benchmark's log lines.
        '200':
          description: Successful Response
          content:
            text/plain:
              schema:
                type: string
              examples:
                response:
                  summary: Plain-text logs for one benchmark
                  value: |
                    INFO starting evaluation
                    INFO benchmark completed
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/providers:
    get:
      # Lists evaluation providers with limit/offset pagination and filters.
      tags:
        - Providers
      operationId: list_providers
      summary: List Providers
      description: List all registered evaluation providers.
      parameters:
        # --- pagination ---
        - name: limit
          in: query
          description: Maximum number of providers to return
          required: false
          schema:
            title: Limit
            description: Maximum number of providers to return
            type: integer
            minimum: 1
            maximum: 100
            default: 50
        - name: offset
          in: query
          description: Offset for pagination
          required: false
          schema:
            title: Offset
            description: Offset for pagination
            type: integer
            minimum: 0
            default: 0
        # --- response shaping ---
        - name: benchmarks
          in: query
          description: Include or exclude benchmarks supported by this provider in the response
          required: false
          schema:
            title: Benchmarks
            description: Set to false to not include benchmarks supported by this provider in the response
            type: boolean
            default: true
        # --- filters ---
        - name: name
          in: query
          description: Name to search for
          required: false
          schema:
            title: Name
            type: string
        - name: tags
          in: query
          description: Tags to search for
          required: false
          schema:
            title: Tags
            type: string
        - name: scope
          in: query
          description: |
            Set to `system` to get only system defined providers, or `tenant` to get only user defined providers. If `scope` is not provided, both system and user defined providers will be returned.
          required: false
          schema:
            title: Scope of providers
            type: string
            enum:
              - system
              - tenant
      responses:
        # 200: one page of providers plus pagination links.
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ProviderResourceList'
              examples:
                response:
                  summary: Paginated list of evaluation providers
                  value:
                    # total_count 3 fits within limit 50, so the example has no
                    # `next` link. Only one of the items is shown.
                    first:
                      href: /api/v1/evaluations/providers?limit=50&offset=0
                    limit: 50
                    total_count: 3
                    items:
                      - resource:
                          id: b3f1a2c4-1234-5678-abcd-ef0123456789
                          tenant: default
                          created_at: '2025-10-01T00:00:00Z'
                          updated_at: '2025-10-01T00:00:00Z'
                        name: lm_evaluation_harness
                        title: LM Evaluation Harness
                        description: Comprehensive evaluation framework for language models with 180 benchmarks
                        tags:
                          - reasoning
                          - science
                          - lm_eval
                        # Kubernetes runtime settings: image, entrypoint and
                        # CPU/memory requests and limits.
                        runtime:
                          k8s:
                            image: quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2
                            entrypoint:
                              - /opt/app-root/bin/python
                              - /opt/app-root/src/main.py
                            cpu_request: 100m
                            memory_request: 128Mi
                            cpu_limit: 500m
                            memory_limit: 4Gi
                        # Benchmarks offered by this provider.
                        benchmarks:
                          - id: arc_easy
                            name: Basic science Q&A
                            description: Grade-school science questions testing basic reasoning and scientific knowledge (AI2 Reasoning Challenge, easy split).
                            category: reasoning
                            metrics:
                              - acc
                              - acc_norm
                            num_few_shot: 0
                            dataset_size: 2376
                            tags:
                              - reasoning
                              - science
                              - lm_eval
                            primary_score:
                              metric: acc_norm
                              lower_is_better: false
                            pass_criteria:
                              threshold: 0.25
        # Error responses shared through `components/responses`.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    post:
      # Registers a tenant-scoped provider. `summary` is a short title like the
      # other operations in this file; the full sentence stays in `description`.
      tags:
        - Providers
      summary: Create Provider
      description: Create a new provider scoped to the current tenant (Bring Your Own Provider)
      operationId: post_providers
      requestBody:
        # The body is mandatory. Three example payloads follow: a Kubernetes
        # runtime, a local runtime and a GPU-enabled Kubernetes runtime.
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ProviderConfig'
            examples:
              # Kubernetes runtime with CPU/memory requests and limits.
              request:
                summary: Register a custom evaluation provider (BYOP)
                value:
                  name: my-custom-evaluator
                  title: Custom Internal Evaluator
                  description: Internal evaluation adapter for domain-specific benchmarks
                  tags:
                    - custom
                    - internal
                  runtime:
                    k8s:
                      image: registry.internal.example.com/eval/custom-adapter:v1.2
                      entrypoint:
                        - /opt/app-root/bin/python
                        - /opt/app-root/src/main.py
                      cpu_request: 250m
                      memory_request: 512Mi
                      cpu_limit: '1'
                      memory_limit: 2Gi
                  benchmarks:
                    - id: domain-qa
                      name: Domain Q&A Accuracy
                      description: Measures accuracy on domain-specific question answering
                      category: reasoning
                      metrics:
                        - acc
                        - f1
                      primary_score:
                        metric: acc
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.5
              # Local runtime: a command line plus environment variables.
              LocalRuntimeProvider:
                summary: Register a provider with a local runtime
                value:
                  name: local-eval-harness
                  title: Local Evaluation Harness
                  description: Local-mode provider for development and testing
                  tags:
                    - local
                    - development
                  runtime:
                    local:
                      command: python /opt/eval/run_benchmark.py
                      env:
                        - name: LOG_LEVEL
                          value: debug
                  benchmarks:
                    - id: custom-qa
                      name: Custom Q&A
                      description: Domain-specific question answering benchmark
                      category: reasoning
                      metrics:
                        - acc
                      primary_score:
                        metric: acc
                        lower_is_better: false
              # Kubernetes runtime with an additional `gpu` block.
              GPUProvider:
                summary: Register a GPU-enabled provider
                value:
                  name: gpu-lm-eval
                  title: GPU LM Evaluation Harness
                  description: LM Evaluation Harness adapter with GPU support for large models
                  tags:
                    - gpu
                    - lm_eval
                  runtime:
                    k8s:
                      image: quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2
                      entrypoint:
                        - /opt/app-root/bin/python
                        - /opt/app-root/src/main.py
                      cpu_request: 500m
                      memory_request: 1Gi
                      cpu_limit: '2'
                      memory_limit: 8Gi
                      gpu:
                        resource: nvidia.com/gpu
                        count: 1
                        node_selector:
                          nvidia.com/gpu.product: A100-SXM4-40GB
                  benchmarks:
                    - id: arc_easy
                      name: Basic science Q&A
                      description: Grade-school science questions testing basic reasoning
                      category: reasoning
                      metrics:
                        - acc
                        - acc_norm
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
      responses:
        '201':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ProviderResource'
              examples:
                response:
                  summary: Newly registered provider
                  value:
                    resource:
                      id: c4d5e6f7-8901-2345-bcde-f67890123456
                      tenant: default
                      created_at: '2026-01-20T10:00:00Z'
                      updated_at: '2026-01-20T10:00:00Z'
                      owner: user@example.com
                    name: my-custom-evaluator
                    title: Custom Internal Evaluator
                    description: Internal evaluation adapter for domain-specific benchmarks
                    tags:
                      - custom
                      - internal
                    runtime:
                      k8s:
                        image: registry.internal.example.com/eval/custom-adapter:v1.2
                        entrypoint:
                          - /opt/app-root/bin/python
                          - /opt/app-root/src/main.py
                        cpu_request: 250m
                        memory_request: 512Mi
                        cpu_limit: '1'
                        memory_limit: 2Gi
                    benchmarks:
                      - id: domain-qa
                        name: Domain Q&A Accuracy
                        description: Measures accuracy on domain-specific question answering
                        category: reasoning
                        metrics:
                          - acc
                          - f1
                        primary_score:
                          metric: acc
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.5
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/providers/{id}:
    # Fetch one provider by the `id` path parameter; a 200 response is a
    # ProviderResource document.
    get:
      tags:
        - Providers
      summary: Get Provider
      description: Get a provider by ID.
      operationId: get_providers_id
      parameters:
        # Required path parameter. The description is stated both on the
        # parameter and on its schema.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Provider ID
            title: Provider Id
          description: Provider ID
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ProviderResource'
              examples:
                # Sample payload: a provider with a k8s runtime and two
                # benchmarks, each scored on attack_success_rate.
                response:
                  summary: Garak vulnerability scanning provider
                  value:
                    # Resource metadata sits beside the provider fields.
                    resource:
                      id: d8e9f0a1-2345-6789-cdef-012345678901
                      tenant: default
                      created_at: '2025-10-01T00:00:00Z'
                      updated_at: '2025-10-01T00:00:00Z'
                    name: garak
                    title: Garak
                    description: LLM vulnerability scanner and red-teaming framework
                    tags:
                      - security
                      - red_team
                    runtime:
                      k8s:
                        image: quay.io/trustyai/trustyai-garak-lls-provider-dsp:latest
                        entrypoint:
                          - python
                          - '-m'
                          - llama_stack_provider_trustyai_garak.evalhub
                        cpu_request: 500m
                        memory_request: 512Mi
                        cpu_limit: 2000m
                        memory_limit: 4Gi
                    benchmarks:
                      # Both entries set lower_is_better: true and a pass
                      # threshold of 0.3 on the same metric.
                      - id: owasp_llm_top10
                        name: OWASP LLM top 10 risk scan
                        description: Tests against the top 10 security risks specific to LLM applications.
                        category: security
                        metrics:
                          - attack_success_rate
                        tags:
                          - security
                          - owasp
                          - red_team
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
                      - id: quality
                        name: Toxic & harmful content scan
                        description: Scans for violence, profanity, toxicity, hate speech, and integrity issues.
                        category: safety
                        metrics:
                          - attack_success_rate
                        tags:
                          - safety
                          - quality
                          - toxicity
                          - red_team
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    # Update the provider addressed by `id`. The request body is required
    # and uses the ProviderConfig schema; a 200 response is a
    # ProviderResource document.
    put:
      tags:
        - Providers
      summary: Update Provider
      description: Update an existing provider.
      operationId: put_providers_id
      parameters:
        # Required path parameter identifying the provider.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Provider ID
            title: Provider Id
          description: Provider ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ProviderConfig'
            examples:
              # Sample body: a complete provider definition with name,
              # runtime and benchmarks, using the v2.0 image tag.
              request:
                summary: Update a custom provider with a new image version
                value:
                  name: my-custom-evaluator
                  title: Custom Internal Evaluator
                  description: Updated evaluation adapter with improved tokenization
                  tags:
                    - custom
                    - internal
                  runtime:
                    k8s:
                      image: registry.internal.example.com/eval/custom-adapter:v2.0
                      entrypoint:
                        - /opt/app-root/bin/python
                        - /opt/app-root/src/main.py
                      cpu_request: 500m
                      memory_request: 1Gi
                      cpu_limit: '2'
                      memory_limit: 4Gi
                  benchmarks:
                    - id: domain-qa
                      name: Domain Q&A Accuracy
                      description: Measures accuracy on domain-specific question answering
                      category: reasoning
                      metrics:
                        - acc
                        - f1
                      primary_score:
                        metric: acc
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.5
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ProviderResource'
              examples:
                # Sample payload: the same fields as the request example
                # plus a `resource` metadata block whose updated_at is
                # later than created_at.
                response:
                  summary: Updated provider resource
                  value:
                    resource:
                      id: c4d5e6f7-8901-2345-bcde-f67890123456
                      tenant: default
                      created_at: '2026-01-20T10:00:00Z'
                      updated_at: '2026-02-05T14:30:00Z'
                    name: my-custom-evaluator
                    title: Custom Internal Evaluator
                    description: Updated evaluation adapter with improved tokenization
                    tags:
                      - custom
                      - internal
                    runtime:
                      k8s:
                        image: registry.internal.example.com/eval/custom-adapter:v2.0
                        entrypoint:
                          - /opt/app-root/bin/python
                          - /opt/app-root/src/main.py
                        cpu_request: 500m
                        memory_request: 1Gi
                        cpu_limit: '2'
                        memory_limit: 4Gi
                    benchmarks:
                      - id: domain-qa
                        name: Domain Q&A Accuracy
                        description: Measures accuracy on domain-specific question answering
                        category: reasoning
                        metrics:
                          - acc
                          - f1
                        primary_score:
                          metric: acc
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.5
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    # Partially update the provider addressed by `id`. The required request
    # body is an array of PatchOperation items; a 200 response is a
    # ProviderResource document.
    patch:
      tags:
        - Providers
      summary: Patch Provider
      description: Partially update an existing provider.
      operationId: patch_providers_id
      parameters:
        # Required path parameter, described the same way as on the other
        # operations of this path.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Provider ID
            title: Provider Id
          description: Provider ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: array
              title: Json Patch
              description: JSON Patch operation
              items:
                $ref: '#/components/schemas/PatchOperation'
            examples:
              # Two `replace` operations: the k8s image and the description.
              request:
                summary: Update the container image version
                value:
                  - op: replace
                    path: /runtime/k8s/image
                    value: registry.internal.example.com/eval/custom-adapter:v2.1
                  - op: replace
                    path: /description
                    value: Updated evaluation adapter with bug fixes
              # NOTE(review): if these operations follow RFC 6902, `add` on
              # an existing `/tags` member replaces the whole list rather
              # than appending to it - confirm this is the intended
              # illustration.
              AddTags:
                summary: Add tags to a provider
                value:
                  - op: add
                    path: /tags
                    value:
                      - gpu
                      - production
              # Targets the `local` runtime variant instead of `k8s`.
              UpdateRuntime:
                summary: Replace the local runtime command
                value:
                  - op: replace
                    path: /runtime/local/command
                    value: python /opt/eval/run_benchmark_v2.py
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ProviderResource'
              examples:
                # Sample payload matching the `request` example above: the
                # image tag is v2.1 and the description is the replaced one.
                response:
                  summary: Provider after applying patch operations
                  value:
                    resource:
                      id: c4d5e6f7-8901-2345-bcde-f67890123456
                      tenant: default
                      created_at: '2026-01-20T10:00:00Z'
                      updated_at: '2026-02-06T09:15:00Z'
                    name: my-custom-evaluator
                    title: Custom Internal Evaluator
                    description: Updated evaluation adapter with bug fixes
                    tags:
                      - custom
                      - internal
                    runtime:
                      k8s:
                        image: registry.internal.example.com/eval/custom-adapter:v2.1
                        entrypoint:
                          - /opt/app-root/bin/python
                          - /opt/app-root/src/main.py
                        cpu_request: 500m
                        memory_request: 1Gi
                        cpu_limit: '2'
                        memory_limit: 4Gi
                    benchmarks:
                      - id: domain-qa
                        name: Domain Q&A Accuracy
                        description: Measures accuracy on domain-specific question answering
                        category: reasoning
                        metrics:
                          - acc
                          - f1
                        primary_score:
                          metric: acc
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.5
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    # Delete the provider addressed by `id`. The success response (204)
    # declares no content.
    delete:
      tags:
        - Providers
      summary: Delete Provider
      description: Delete provider by ID.
      operationId: delete_providers_id
      parameters:
        # Required path parameter identifying the provider.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Provider ID
            title: Provider Id
          description: Provider ID
      responses:
        '204':
          description: Successful Response
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/collections:
    # List benchmark collections with paging and optional filters. A 200
    # response is a CollectionResourceList document.
    get:
      tags:
        - Collections
      summary: List Collections
      description: List all benchmark collections.
      operationId: get_collections
      parameters:
        # Page size: integer from 1 to 100, default 50.
        - name: limit
          in: query
          required: false
          schema:
            type: integer
            maximum: 100
            minimum: 1
            description: Maximum number of collections to return
            default: 50
            title: Limit
          description: Maximum number of collections to return
        # Page start: non-negative integer, default 0.
        - name: offset
          in: query
          required: false
          schema:
            type: integer
            minimum: 0
            description: Offset for pagination
            default: 0
            title: Offset
          description: Offset for pagination
        # Optional free-form string filters.
        - name: name
          in: query
          required: false
          schema:
            type: string
            title: Name
          description: Name to search for
        - name: category
          in: query
          required: false
          schema:
            type: string
            title: Category
          description: Category to search for
        - name: tags
          in: query
          required: false
          schema:
            type: string
            title: Tags
          description: Tags to search for
        # Optional filter restricted to the two enum values below.
        - name: scope
          in: query
          required: false
          schema:
            type: string
            enum:
              - system
              - tenant
            title: Scope of collections
          description: |
            Set to `system` to get only system defined collections, or `tenant` to get only user defined collections. If `scope` is not provided, both system and user defined collections will be returned.
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionResourceList'
              examples:
                # Sample first page. total_count matches the single
                # collection listed under `items`.
                response:
                  summary: Paginated list of benchmark collections
                  value:
                    first:
                      href: /api/v1/evaluations/collections?limit=50&offset=0
                    limit: 50
                    total_count: 1
                    items:
                      - resource:
                          id: e5f6a7b8-9012-3456-cdef-0123456789ab
                          tenant: default
                          created_at: '2025-12-01T10:00:00Z'
                          updated_at: '2025-12-01T10:00:00Z'
                        name: llm-safety-suite
                        category: safety
                        description: Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning
                        tags:
                          - safety
                          - nightly
                        pass_criteria:
                          threshold: 0.5
                        # Two benchmarks from different providers; the
                        # weights are 0.6 and 0.4.
                        benchmarks:
                          - id: arc_easy
                            provider_id: lm_evaluation_harness
                            weight: 0.6
                            primary_score:
                              metric: acc_norm
                              lower_is_better: false
                            pass_criteria:
                              threshold: 0.25
                          - id: owasp_llm_top10
                            provider_id: garak
                            weight: 0.4
                            primary_score:
                              metric: attack_success_rate
                              lower_is_better: true
                            pass_criteria:
                              threshold: 0.3
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    # Create a collection. The required request body uses the
    # CollectionConfig schema; a 201 response is a CollectionResource
    # document.
    post:
      tags:
        - Collections
      summary: Create Collection
      description: Create a new collection.
      operationId: post_collections
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CollectionConfig'
            examples:
              # Full example: two weighted benchmarks, the first of which
              # also passes per-benchmark `parameters`.
              request:
                summary: Create a safety evaluation collection
                value:
                  name: release-gate-safety
                  category: safety
                  description: Release-gate collection combining reasoning and red-teaming benchmarks
                  tags:
                    - release-gate
                    - safety
                  pass_criteria:
                    threshold: 0.5
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      weight: 0.6
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
                      parameters:
                        num_fewshot: 0
                        limit: 100
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.4
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
              # Smallest example shown: name, category and one benchmark
              # given only by id and provider_id.
              MinimalCollection:
                summary: Create a minimal collection
                value:
                  name: quick-reasoning-check
                  category: reasoning
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
              # Three benchmarks from the same provider, weighted
              # 0.4 / 0.3 / 0.3, each with its own pass threshold.
              MultiBenchmarkCollection:
                summary: Create a multi-benchmark collection with per-benchmark thresholds
                value:
                  name: comprehensive-safety-gate
                  category: safety
                  description: Comprehensive safety gate combining multiple Garak scans
                  tags:
                    - safety
                    - release-gate
                  pass_criteria:
                    threshold: 0.5
                  benchmarks:
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.4
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
                    - id: quality
                      provider_id: garak
                      weight: 0.3
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
                    - id: intents
                      provider_id: garak
                      weight: 0.3
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
        required: true
      responses:
        '201':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionResource'
              examples:
                # Sample payload: the fields of the `request` example plus
                # a `resource` metadata block that includes an owner.
                response:
                  summary: Newly created collection
                  value:
                    resource:
                      id: f6a7b8c9-0123-4567-def0-123456789abc
                      tenant: default
                      created_at: '2026-02-01T09:00:00Z'
                      updated_at: '2026-02-01T09:00:00Z'
                      owner: user@example.com
                    name: release-gate-safety
                    category: safety
                    description: Release-gate collection combining reasoning and red-teaming benchmarks
                    tags:
                      - release-gate
                      - safety
                    pass_criteria:
                      threshold: 0.5
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.6
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                        parameters:
                          num_fewshot: 0
                          limit: 100
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.4
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
  /api/v1/evaluations/collections/{id}:
    # Fetch one collection by the `id` path parameter; a 200 response is a
    # CollectionResource document.
    get:
      tags:
        - Collections
      summary: Get Collection
      description: Get details of a specific collection.
      operationId: get_collections_id
      parameters:
        # Required path parameter. The description is stated both on the
        # parameter and on its schema.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Collection ID
            title: Collection Id
          description: Collection ID
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionResource'
              examples:
                # Sample payload: one collection with two weighted
                # benchmarks from different providers.
                response:
                  summary: Safety evaluation collection with benchmarks
                  value:
                    resource:
                      id: e5f6a7b8-9012-3456-cdef-0123456789ab
                      tenant: default
                      created_at: '2025-12-01T10:00:00Z'
                      updated_at: '2025-12-01T10:00:00Z'
                    name: llm-safety-suite
                    category: safety
                    description: Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning
                    tags:
                      - safety
                      - nightly
                    pass_criteria:
                      threshold: 0.5
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.6
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.4
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    # Update the collection addressed by `id`. The required request body
    # uses the CollectionConfig schema; a 200 response is a
    # CollectionResource document.
    put:
      tags:
        - Collections
      summary: Update Collection
      description: Update an existing collection.
      operationId: put_collections_id
      parameters:
        # Required path parameter. The description is stated both on the
        # parameter and on its schema.
        - name: id
          in: path
          required: true
          schema:
            type: string
            description: Collection ID
            title: Collection Id
          description: Collection ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CollectionConfig'
            examples:
              # Three benchmarks weighted 0.5 / 0.3 / 0.2; the tag list
              # includes `updated`.
              request:
                summary: Update collection to add a quality benchmark
                value:
                  name: llm-safety-suite
                  category: safety
                  description: Safety evaluation with reasoning, OWASP risks, and content quality
                  tags:
                    - safety
                    - nightly
                    - updated
                  pass_criteria:
                    threshold: 0.5
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      weight: 0.5
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.3
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
                    - id: quality
                      provider_id: garak
                      weight: 0.2
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
              # Two lm_evaluation_harness benchmarks and one garak
              # benchmark, weighted 0.4 / 0.2 / 0.4.
              UpdateWithDifferentProviders:
                summary: Revise collection with benchmarks from multiple providers
                value:
                  name: llm-safety-suite
                  category: safety
                  description: Revised safety suite with lm-eval and Garak benchmarks
                  tags:
                    - safety
                    - nightly
                  pass_criteria:
                    threshold: 0.5
                  benchmarks:
                    - id: arc_easy
                      provider_id: lm_evaluation_harness
                      weight: 0.4
                      primary_score:
                        metric: acc_norm
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
                    - id: blimp
                      provider_id: lm_evaluation_harness
                      weight: 0.2
                      primary_score:
                        metric: acc
                        lower_is_better: false
                      pass_criteria:
                        threshold: 0.25
                    - id: owasp_llm_top10
                      provider_id: garak
                      weight: 0.4
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionResource'
              examples:
                # Sample payload: the fields of the `request` example plus
                # a `resource` metadata block whose updated_at is later
                # than created_at.
                response:
                  summary: Updated collection with three benchmarks
                  value:
                    resource:
                      id: e5f6a7b8-9012-3456-cdef-0123456789ab
                      tenant: default
                      created_at: '2025-12-01T10:00:00Z'
                      updated_at: '2026-02-10T11:00:00Z'
                    name: llm-safety-suite
                    category: safety
                    description: Safety evaluation with reasoning, OWASP risks, and content quality
                    tags:
                      - safety
                      - nightly
                      - updated
                    pass_criteria:
                      threshold: 0.5
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.5
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.3
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
                      - id: quality
                        provider_id: garak
                        weight: 0.2
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
        # Error responses reference the shared entries under
        # components/responses.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    patch:
      # Partial update of a collection. The request body is an array of
      # PatchOperation items; the 200 response carries a CollectionResource.
      tags:
        - Collections
      summary: Patch Collection
      description: Partially update an existing collection.
      operationId: patch_collections_id
      parameters:
        - name: id
          in: path
          required: true
          schema:
            type: string
            title: Collection Id
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: array
              title: Json Patch
              # The body is a list, so the description is plural.
              description: JSON Patch operations to apply to the collection
              items:
                $ref: '#/components/schemas/PatchOperation'
            examples:
              # Two scalar replacements in one request.
              request:
                summary: Raise the overall pass threshold
                value:
                  - op: replace
                    path: /pass_criteria/threshold
                    value: 0.6
                  - op: replace
                    path: /description
                    value: Safety evaluation with stricter pass threshold
              # Replaces the whole benchmark entry at index 0.
              ReplaceBenchmark:
                summary: Replace a benchmark in the collection
                value:
                  - op: replace
                    path: /benchmarks/0
                    value:
                      id: avid
                      provider_id: garak
                      weight: 0.5
                      primary_score:
                        metric: attack_success_rate
                        lower_is_better: true
                      pass_criteria:
                        threshold: 0.3
              # Replaces a scalar field and an entire array field.
              UpdateNameAndTags:
                summary: Rename collection and update tags
                value:
                  - op: replace
                    path: /name
                    value: llm-safety-suite-v2
                  - op: replace
                    path: /tags
                    value:
                      - safety
                      - nightly
                      - v2
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionResource'
              examples:
                # Matches the `request` example: threshold 0.6 and the new
                # description.
                response:
                  summary: Collection after raising the pass threshold
                  value:
                    resource:
                      id: e5f6a7b8-9012-3456-cdef-0123456789ab
                      tenant: default
                      created_at: '2025-12-01T10:00:00Z'
                      updated_at: '2026-02-10T12:30:00Z'
                    name: llm-safety-suite
                    category: safety
                    description: Safety evaluation with stricter pass threshold
                    tags:
                      - safety
                      - nightly
                    pass_criteria:
                      threshold: 0.6
                    benchmarks:
                      - id: arc_easy
                        provider_id: lm_evaluation_harness
                        weight: 0.6
                        primary_score:
                          metric: acc_norm
                          lower_is_better: false
                        pass_criteria:
                          threshold: 0.25
                      - id: owasp_llm_top10
                        provider_id: garak
                        weight: 0.4
                        primary_score:
                          metric: attack_success_rate
                          lower_is_better: true
                        pass_criteria:
                          threshold: 0.3
        # Error responses reuse the shared component definitions.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      # Deletes the collection identified by the `id` path parameter. The
      # success response (204) declares no content.
      operationId: delete_collections_id
      summary: Delete Collection
      description: Delete a collection.
      tags: [Collections]
      parameters:
        - in: path
          name: id
          required: true
          schema:
            title: Collection Id
            type: string
      responses:
        '204':
          description: Success
        # Error responses reuse the shared component definitions.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
components:
  schemas:
    # Health check payload; every field except `git_hash` is required.
    HealthResponse:
      type: object
      description: Health check response
      properties:
        status:
          type: string
          description: Overall status (e.g. healthy, degraded)
        version:
          type: string
          description: Service version
        timestamp:
          type: string
          format: date-time
          description: Server timestamp
        git_hash:
          type: string
          description: Commit hash
        uptime:
          type: integer
          # A nanosecond counter exceeds the 32-bit range after about 2.1
          # seconds, so the width is pinned explicitly for code generators.
          format: int64
          description: Uptime in nanoseconds
      required:
        - status
        - version
        - timestamp
        - uptime
    # Error body: a code and a message are mandatory, `trace` is optional.
    Error:
      description: Error response
      type: object
      required: [message_code, message]
      properties:
        message_code:
          description: Machine-readable error code
          type: string
        message:
          description: Human-readable message
          type: string
        trace:
          description: Request trace or debug info
          type: string
    # Object wrapper around a single optional `href` string.
    HRef:
      description: Hypermedia reference
      type: object
      properties:
        href:
          description: URI reference
          type: string
    # Pagination fields: page links plus page size and total item count.
    Page:
      description: Generic pagination schema
      type: object
      properties:
        first:
          description: Link to first page
          $ref: '#/components/schemas/HRef'
        next:
          description: Link to next page (omit if none)
          $ref: '#/components/schemas/HRef'
        limit:
          description: Page size
          type: integer
        total_count:
          description: Total number of items
          type: integer
    # Common metadata fields; none of them is marked required.
    Resource:
      description: Base resource fields
      type: object
      properties:
        id:
          description: Unique identifier
          type: string
        tenant:
          description: Tenant for scoping
          type: string
        created_at:
          description: Creation timestamp
          type: string
          format: date-time
        updated_at:
          description: Last update timestamp
          type: string
          format: date-time
        owner:
          description: Owner of the resource
          type: string
    # Resource extended with the MLFlow experiment identifier.
    EvaluationResource:
      description: Evaluation resource (base fields for an evaluation)
      type: object
      allOf:
        - $ref: '#/components/schemas/Resource'
        - properties:
            mlflow_experiment_id:
              description: MLFlow experiment ID
              type: string
          type: object
    # Job-level state values, including `partially_failed`.
    OverallState:
      description: Overall evaluation job state
      type: string
      enum: [pending, running, completed, failed, cancelled, partially_failed]
    # Message/code pair; both fields are required.
    MessageInfo:
      description: Message from a downstream service
      type: object
      required: [message, message_code]
      properties:
        message:
          description: Human-readable message
          type: string
        message_code:
          description: Machine-readable code
          type: string
    # Pairs the overall job state with its accompanying message.
    EvaluationJobState:
      description: Overall evaluation job state and message
      type: object
      properties:
        state:
          $ref: '#/components/schemas/OverallState'
        message:
          $ref: '#/components/schemas/MessageInfo'
      required: [state, message]
    # State values used for individual evaluations and benchmarks.
    State:
      description: Evaluation or benchmark state
      type: string
      enum: [pending, running, completed, failed, cancelled]
    # Named execution phases a benchmark can report.
    JobPhase:
      description: Current execution phase of the benchmark
      type: string
      enum: [initializing, loading_data, running_evaluation, post_processing, persisting_artifacts, completed]
    # Per-benchmark record: identity, state and phase, messages, timestamps.
    BenchmarkStatus:
      description: Status of an individual benchmark in an evaluation
      type: object
      properties:
        provider_id:
          description: Provider ID
          type: string
        id:
          description: Benchmark ID
          type: string
        benchmark_index:
          description: Benchmark index in the evaluation job request
          type: integer
        status:
          $ref: '#/components/schemas/State'
        phase:
          $ref: '#/components/schemas/JobPhase'
        error_message:
          $ref: '#/components/schemas/MessageInfo'
        warning_message:
          $ref: '#/components/schemas/MessageInfo'
        started_at:
          description: RFC3339 start time
          type: string
          format: date-time
        completed_at:
          description: RFC3339 completion time
          type: string
          format: date-time
    # EvaluationJobState plus an optional list of per-benchmark statuses.
    EvaluationJobStatus:
      description: Current status of an evaluation job
      type: object
      allOf:
        - $ref: '#/components/schemas/EvaluationJobState'
        - properties:
            benchmarks:
              description: Per-benchmark status
              type: array
              items:
                $ref: '#/components/schemas/BenchmarkStatus'
          type: object
    # Pass/fail outcome for one benchmark: score, threshold, and verdict.
    BenchmarkTest:
      description: The test result of a single benchmark run
      type: object
      properties:
        primary_score:
          description: Primary score value
          type: number
          format: float
        threshold:
          description: Threshold value
          type: number
          format: float
        pass:
          description: Whether the benchmark passed
          type: boolean
    # Result record for one benchmark. `metrics`, `additional_info`, and
    # `artifacts` are free-form objects.
    BenchmarkResult:
      description: Result of a single benchmark run
      type: object
      properties:
        id:
          description: Benchmark ID
          type: string
        provider_id:
          description: Provider ID
          type: string
        benchmark_index:
          description: Benchmark index in the evaluation job request
          type: integer
        metrics:
          description: Metric name to value
          type: object
          additionalProperties: true
        additional_info:
          description: Additional benchmark information
          type: object
          additionalProperties: true
        artifacts:
          description: Artifact key to location/info
          type: object
          additionalProperties: true
        mlflow_run_id:
          description: MLFlow run ID
          type: string
        logs_path:
          description: Path to logs
          type: string
        test:
          description: Test result
          $ref: '#/components/schemas/BenchmarkTest'
    # Pass/fail outcome for the evaluation as a whole.
    EvaluationTest:
      description: The test result for an evaluation
      type: object
      properties:
        score:
          description: Evaluation score value
          type: number
          format: float
        threshold:
          description: Threshold value
          type: number
          format: float
        pass:
          description: Whether the entire evaluation passed
          type: boolean
    # Results container: per-benchmark results, MLFlow link, overall test.
    EvaluationJobResults:
      description: Results section for an evaluation job
      type: object
      properties:
        benchmarks:
          description: Per-benchmark results
          type: array
          items:
            $ref: '#/components/schemas/BenchmarkResult'
        mlflow_experiment_url:
          description: URL to MLFlow experiment
          type: string
        test:
          description: Test result
          $ref: '#/components/schemas/EvaluationTest'
    # Holds a reference to a secret rather than the credentials themselves.
    ModelAuth:
      description: The model authentication configuration
      type: object
      properties:
        secret_ref:
          description: The reference to the secret containing the model authentication credentials
          type: string
    # Model under evaluation: `url` and `name` are mandatory, while
    # `parameters` and `auth` are optional.
    ModelRef:
      description: Model specification for evaluation requests
      type: object
      required: [url, name]
      properties:
        url:
          description: Model URL
          type: string
        name:
          description: Model name
          type: string
        parameters:
          description: Model specific parameters
          type: object
          additionalProperties: true
        auth:
          description: The model authentication configuration
          $ref: '#/components/schemas/ModelAuth'
    # First shared fragment of the job request: name, description, tags, model.
    EvaluationJobConfigCommon1:
      title: EvaluationJobConfigCommon1
      description: |
        The evaluation job request configuration.
      type: object
      required: [name, model]
      properties:
        name:
          description: |
            The evaluation job name.
          type: string
        description:
          description: |
            The evaluation job description.
          type: string
        tags:
          description: |
            The evaluation job tags.
          type: array
          items:
            type: string
        model:
          description: |
            The model to evaluate.
          $ref: '#/components/schemas/ModelRef'
    # Minimal reference object carrying only a required `id`.
    Ref:
      description: Reference by ID
      type: object
      required: [id]
      properties:
        id:
          description: Unique identifier
          type: string
    # Names the primary metric and states whether lower values are better.
    PrimaryScore:
      description: Primary score metric configuration
      type: object
      properties:
        metric:
          description: Metric name
          type: string
        lower_is_better:
          description: Whether lower values indicate better performance
          type: boolean
          default: false
    # Threshold criteria where `threshold` must be supplied (no default).
    PassCriteria:
      description: Pass/fail threshold criteria.
      type: object
      required: [threshold]
      properties:
        threshold:
          description: Threshold value.
          type: number
          format: float
    # Points at a HardwareProfile custom resource by name and, optionally,
    # namespace. Both fields are validated as RFC 1123 DNS labels.
    HardwareProfileRef:
      type: object
      title: HardwareProfileRef
      description: Reference to an OpenDataHub HardwareProfile custom resource.
      required:
        - name
      properties:
        name:
          type: string
          pattern: '^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
          # A DNS label is at most 63 characters; the pattern alone does not
          # bound the length.
          maxLength: 63
          description: |
            Name of the `HardwareProfile` custom resource (`hardwareprofiles.infrastructure.opendatahub.io`). Must conform to RFC 1123 DNS label naming (lowercase letters, digits, and internal hyphens).
          examples:
            - default-profile
            - gpu-profile-v1
        namespace:
          type: string
          pattern: '^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
          # Kubernetes namespace names are DNS labels, so the same 63-character
          # cap applies.
          maxLength: 63
          description: |
            Namespace of the `HardwareProfile` custom resource. When omitted, the server defaults to the evaluation job tenant namespace (from the `X-Tenant` request header in cluster mode).
          examples:
            - my-tenant-namespace
    # Wrapper whose only property is an optional hardware profile reference.
    BenchmarkHardwareConfig:
      title: BenchmarkHardwareConfig
      description: |
        Optional per-benchmark hardware override for Kubernetes runtimes. When `hardware_profile_ref` is set, the server fetches the named OpenDataHub `HardwareProfile` custom resource and applies its CPU, memory, and GPU resource values to the evaluation Job adapter container. Values missing from the profile fall back to the provider `runtime.k8s` configuration (and built-in defaults when those are unset).
      type: object
      properties:
        hardware_profile_ref:
          $ref: '#/components/schemas/HardwareProfileRef'
    # S3 location of benchmark test data. All three fields are required, and
    # credentials are supplied indirectly through the named Secret.
    TestDataRefS3:
      type: object
      description: |
        S3 (or S3-compatible) test data. Only applies when Eval Hub runs the benchmark as a
        **Kubernetes Job** (cluster execution). It does not apply to local-only evaluation runs.

        The test-data **init container** loads the Secret named **`secret_ref`** from the
        job namespace. It expects these keys. **All values must be non-empty** or the init
        container fails:

        - **`AWS_ACCESS_KEY_ID`** — access key
        - **`AWS_SECRET_ACCESS_KEY`** — secret key
        - **`AWS_DEFAULT_REGION`** — region used for signing (for example `us-east-1`)
        - **`AWS_S3_ENDPOINT`** — base URL of the S3 API (AWS regional endpoint or your
          S3-compatible gateway)

        These align with Secrets commonly produced by ODH/RHOAI when an S3 connection is created.
      required:
        - bucket
        - key
        - secret_ref
      properties:
        bucket:
          type: string
          description: S3 bucket name.
        key:
          type: string
          description: |
            Object key or prefix within the bucket. Leading and trailing slashes are
            normalized. The init container lists objects under this prefix and downloads
            each into **`/test_data`**, keeping subdirectory structure relative to the prefix.
        secret_ref:
          type: string
          description: |
            Name of a Kubernetes **`Secret`** in the evaluation job namespace, mounted
            read-only into the test-data init container.
      # OpenAPI 3.1 deprecates the singular schema-level `example` keyword in
      # favour of the JSON Schema `examples` array.
      examples:
        - bucket: my-eval-bucket
          key: datasets/benchmark-a/v1
          secret_ref: my-s3-connection-secret
    # External test data reference; `s3` is the only source and is required.
    TestDataRef:
      description: |
        Reference to external test data for a benchmark.

        When the evaluation runs as a **Kubernetes Job**, setting `s3` makes Eval Hub schedule
        an **init container** on the evaluation job pod before the adapter runs. That init container:

        - Reads **`bucket`**, **`key`**, and **`secret_ref`** from the job spec (same JSON
          mounted into the pod).
        - Uses credentials from the Kubernetes **Secret** named by **`secret_ref`**
          (read-only mount; typical sources include an ODH/RHOAI S3 connection Secret).
        - Lists and downloads all objects under the **`key`** prefix into a shared
          **`emptyDir`** volume at **`/test_data`**, preserving relative paths.
        - Exits; the **adapter container** mounts `/test_data` and reads the files.

        Do not put credentials in the Eval Hub API request body; only reference an existing
        Secret via **`secret_ref`**.
      type: object
      required: [s3]
      properties:
        s3:
          $ref: '#/components/schemas/TestDataRefS3'
    # Benchmark entry in a job request: Ref (`id`) plus a required
    # `provider_id` and optional scoring, hardware, parameter, and data fields.
    EvaluationBenchmarkConfig:
      description: |
        Reference to a benchmark in an evaluation job (top-level benchmarks or collection override list).
        Does not include a URL; URLs appear only on collection benchmark entries.
      type: object
      allOf:
        - $ref: '#/components/schemas/Ref'
        - type: object
          required: [provider_id]
          properties:
            provider_id:
              description: Evaluation provider ID.
              type: string
            weight:
              description: Weight of this benchmark.
              type: number
              format: float
              minimum: 0
            primary_score:
              $ref: '#/components/schemas/PrimaryScore'
            pass_criteria:
              $ref: '#/components/schemas/PassCriteria'
            hardware_config:
              description: |
                Optional hardware profile override for this benchmark. Kubernetes runtimes
                only; ignored by local runtimes.
              $ref: '#/components/schemas/BenchmarkHardwareConfig'
            parameters:
              description: Benchmark specific parameters.
              type: object
              additionalProperties: true
            test_data_ref:
              description: |
                Optional external test data (e.g. **S3**). For **Kubernetes Job** runs, may
                trigger a pre-run init container that downloads files to **`/test_data`** for
                the adapter. Omit if the benchmark does not need mounted files.
              $ref: '#/components/schemas/TestDataRef'
    # Threshold criteria where `threshold` is optional and defaults to 0.5.
    PassCriteriaWithDefault:
      description: Pass/fail threshold criteria.
      type: object
      properties:
        threshold:
          description: Threshold value.
          type: number
          format: float
          default: 0.5
    # Key/value tag; both parts are required and length-limited.
    ExperimentTag:
      description: Tag on an experiment (e.g. MLFlow)
      type: object
      required: [key, value]
      properties:
        key:
          description: Tag key
          type: string
          maxLength: 250
        value:
          description: Tag value
          type: string
          maxLength: 5000
    # Experiment tracking settings; at most 20 tags may be supplied.
    ExperimentConfig:
      description: Configuration for MLFlow experiment tracking
      type: object
      properties:
        name:
          description: Experiment name
          type: string
        tags:
          description: Experiment tags
          type: array
          maxItems: 20
          items:
            $ref: '#/components/schemas/ExperimentTag'
        artifact_location:
          description: Artifact storage location
          type: string
    # Closed object (no extra properties): host and repository are required;
    # tag, subject, and annotations are optional.
    OCICoordinates:
      title: OCICoordinates
      description: OCI artifact coordinates for persistence.
      type: object
      required: [oci_host, oci_repository]
      additionalProperties: false
      properties:
        oci_host:
          title: OCI Host
          description: OCI registry host (e.g. 'quay.io')
          type: string
        oci_repository:
          title: OCI Repository
          description: OCI repository path (e.g. 'my-org/my-repo')
          type: string
        oci_tag:
          title: OCI Tag
          description: OCI tag (e.g. 'eval-123')
          type: string
        oci_subject:
          title: OCI Subject
          description: Optional OCI subject identifier (in same registry and repo)
          type: string
        annotations:
          title: Annotations
          description: Custom annotations for the OCI artifact
          type: object
          additionalProperties:
            type: string
    # Closed object naming the Secret used for registry authentication.
    OCIConnectionConfig:
      title: OCIConnectionConfig
      description: |
        Kubernetes connection configuration for OCI registry authentication. References a Kubernetes Secret of type `kubernetes.io/dockerconfigjson` containing a `.dockerconfigjson` entry with standard Docker registry credentials.
      type: object
      required: [connection]
      additionalProperties: false
      properties:
        connection:
          title: Connection
          description: |
            Name of a Kubernetes Secret (type kubernetes.io/dockerconfigjson) with a ".dockerconfigjson" entry used for OCI registry authentication.
          type: string
    # Closed object: `coordinates` is required, `k8s` authentication optional.
    EvaluationExportsOCI:
      title: EvaluationExportsOCI
      description: OCI export configuration for an evaluation job.
      type: object
      required: [coordinates]
      additionalProperties: false
      properties:
        coordinates:
          description: Coordinates where the OCI artifact should be stored
          $ref: '#/components/schemas/OCICoordinates'
        k8s:
          description: Optional Kubernetes connection for OCI registry authentication
          $ref: '#/components/schemas/OCIConnectionConfig'
    # Closed object whose only export target is `oci`.
    EvaluationExports:
      title: EvaluationExports
      description: |
        Optional exports configuration for an evaluation job. When provided, the evaluation job results will be exported to the specified location.
      type: object
      additionalProperties: false
      properties:
        oci:
          description: |
            The OCI artifact export configuration.
          $ref: '#/components/schemas/EvaluationExportsOCI'
    # Scheduling queue selection. `name` is required; `kind` defaults to
    # `kueue`, the only value currently listed.
    QueueConfig:
      type: object
      title: QueueConfig
      description: |
        Optional workload scheduling queue for Kubernetes runtimes (e.g. Kueue LocalQueue).
      required:
        - name
      properties:
        kind:
          type: string
          enum:
            - kueue
          default: kueue
          description: |
            Queue integration to use. When omitted, the server behaves as if `kueue` was set. Only `kueue` is supported today.
        name:
          type: string
          pattern: '^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
          # The value is documented as a DNS label and is written to a
          # Kubernetes label value; both are limited to 63 characters, which
          # the pattern alone does not enforce.
          maxLength: 63
          description: |
            Queue resource name (for Kueue, the LocalQueue name). Must conform to RFC 1123 DNS label naming (lowercase letters, digits, and internal hyphens). Set on the Job as label `kueue.x-k8s.io/queue-name`.
    # Second shared fragment of the job request; every property is optional.
    EvaluationJobConfigCommon2:
      title: EvaluationJobConfigCommon2
      description: |
        The evaluation job request configuration.
      type: object
      properties:
        experiment:
          description: |
            The MLFlow experiment configuration. When provided, the evaluation job will be tracked in MLFlow.
          $ref: '#/components/schemas/ExperimentConfig'
        exports:
          description: |
            Optional exports configuration for the evaluation job. When provided, the evaluation job results will be exported to the specified location.
          $ref: '#/components/schemas/EvaluationExports'
        queue:
          description: |
            Optional scheduling queue for Kubernetes-backed evaluation jobs (e.g. Kueue).
          $ref: '#/components/schemas/QueueConfig'
        custom:
          description: |
            Custom request data. This can be used for user specific job data.
          type: object
          additionalProperties: true
    # Job request variant listing benchmarks directly: both common fragments
    # plus a required `benchmarks` array.
    EvaluationJobConfigBenchmarks:
      title: EvaluationJobConfigBenchmarks
      allOf:
        - $ref: '#/components/schemas/EvaluationJobConfigCommon1'
        - type: object
          required: [benchmarks]
          properties:
            benchmarks:
              description: |
                The evaluation benchmarks to run.
              type: array
              items:
                $ref: '#/components/schemas/EvaluationBenchmarkConfig'
            pass_criteria:
              description: |
                The overall pass criteria for the evaluation job.
              $ref: '#/components/schemas/PassCriteriaWithDefault'
        - $ref: '#/components/schemas/EvaluationJobConfigCommon2'
    # Collection selector: a required `id` plus an optional benchmark list.
    CollectionRef:
      description: |
        The collection to use for the evaluation job.
      type: object
      required: [id]
      properties:
        id:
          description: |
            The unique identifier of the collection.
          type: string
        benchmarks:
          description: |
            The benchmarks supported by this collection. If a benchmark is specified but is not supported by the collection, the evaluation job will fail. This is needed in order to allow overriding the collection benchmarks for a specific evaluation job.
          type: array
          items:
            $ref: '#/components/schemas/EvaluationBenchmarkConfig'
    # Job request variant driven by a collection: both common fragments plus
    # a required `collection` reference.
    EvaluationJobConfigCollection:
      title: EvaluationJobConfigCollection
      allOf:
        - $ref: '#/components/schemas/EvaluationJobConfigCommon1'
        - type: object
          required: [collection]
          properties:
            collection:
              $ref: '#/components/schemas/CollectionRef'
            pass_criteria:
              description: |
                The overall pass criteria for the evaluation job.
                If this is not set then the collection pass criteria will be used.
              $ref: '#/components/schemas/PassCriteria'
        - $ref: '#/components/schemas/EvaluationJobConfigCommon2'
    # Exactly one of the two request variants: benchmarks or collection.
    EvaluationJobConfig:
      title: EvaluationJobConfig
      description: |
        The evaluation job request configuration.
      type: object
      oneOf:
        - $ref: '#/components/schemas/EvaluationJobConfigBenchmarks'
        - $ref: '#/components/schemas/EvaluationJobConfigCollection'
    # Response shape: server-side `resource`, `status`, and `results` fields
    # combined with the job configuration.
    EvaluationJobResource:
      description: Full evaluation job resource (response)
      type: object
      allOf:
        - type: object
          properties:
            resource:
              description: Resource and evaluation metadata
              $ref: '#/components/schemas/EvaluationResource'
            status:
              description: Current status
              $ref: '#/components/schemas/EvaluationJobStatus'
            results:
              description: Results when completed
              $ref: '#/components/schemas/EvaluationJobResults'
        - $ref: '#/components/schemas/EvaluationJobConfig'
    # Page fields plus the `items` array and a list of non-fatal error strings.
    EvaluationJobResourceList:
      description: List of evaluation job resources with pagination
      type: object
      allOf:
        - $ref: '#/components/schemas/Page'
        - type: object
          properties:
            items:
              description: Evaluation job resources
              type: array
              items:
                $ref: '#/components/schemas/EvaluationJobResource'
            errors:
              description: Non-fatal errors (e.g. partial list)
              type: array
              items:
                type: string
    # Optional descriptive metadata. Most fields are string arrays;
    # `target_type` is an enum and `summary` is capped at 200 characters.
    AgentMetadata:
      description: Metadata for AI agent discoverability and operational guidance
      type: object
      properties:
        evaluates:
          description: Semantic tags describing what this provider measures
          type: array
          items:
            type: string
        recommended_when:
          description: Natural-language conditions for agent recommendation
          type: array
          items:
            type: string
        target_type:
          description: What kind of target this provider evaluates
          type: string
          enum: [model, agent, inference_server]
        summary:
          description: Concise description optimised for agent tool listings
          type: string
          maxLength: 200
        complements:
          description: Provider IDs that pair well with this one
          type: array
          items:
            type: string
        hints:
          description: Free-form operational guidance for agents constructing job requests
          type: array
          items:
            type: string
        result_interpretation:
          description: Guidance for interpreting evaluation results
          type: array
          items:
            type: string
    # GPU request settings for a provider adapter; no field is marked required.
    GPUConfig:
      type: object
      title: GPUConfig
      # The previous wording said a `count` of 0 disables GPU requests, which
      # contradicts `minimum: 1` on `count`. Omitting `gpu` is the documented
      # way to run without GPUs.
      # NOTE(review): `count` is not listed as required; confirm how the server
      # treats a `gpu` object that omits it.
      description: |
        GPU resources required by a Kubernetes provider adapter. Omit the parent `gpu` field on `K8sRuntime` for CPU-only adapters. Kubernetes sets GPU requests and limits to the same value (`count`). When `gpu` is omitted, the evaluation runs without GPU requests.
      properties:
        resource:
          type: string
          description: |
            Kubernetes extended resource name (for example `nvidia.com/gpu` or `amd.com/gpu`). When omitted, no specific GPU resource is requested in the pod spec; node selection may be governed by Kueue ResourceFlavors or cluster defaults.
          examples:
            - nvidia.com/gpu
        count:
          type: integer
          minimum: 1
          description: |
            Number of GPU units to request. Must be at least 1 when `gpu` is specified.
        node_selector:
          type: object
          additionalProperties:
            type: string
          description: |
            Optional node label key/value pairs added to the evaluation pod to target a GPU model or node pool (for example `nvidia.com/gpu.product: NVIDIA-H100-SXM5-80GB`). Ignored when the evaluation job is submitted with a queue; in that case Kueue ResourceFlavors govern node selection.
          examples:
            - nvidia.com/gpu.product: NVIDIA-H100-SXM5-80GB
    # One name/value pair; both keys are mandatory and both are strings.
    EnvVar:
      description: Environment variable for job configuration
      type: object
      required:
        - name
        - value
      properties:
        name:
          description: Environment variable name
          type: string
        value:
          description: Environment variable value
          type: string
    # Container settings for running a provider's evaluation job on Kubernetes.
    # Only `image` and `entrypoint` are required; resource requests/limits,
    # GPU, environment variables and pull policy are optional.
    K8sRuntime:
      type: object
      description: Kubernetes runtime configuration for provider evaluation jobs
      properties:
        image:
          type: string
          description: Container image for the evaluation job (e.g. quay.io/evalhub/adapter:latest)
        entrypoint:
          type: array
          items:
            type: string
          description: Container entrypoint command
        # Requests and limits are plain strings in Kubernetes quantity notation
        # (see the examples in each description).
        cpu_request:
          type: string
          description: CPU request (e.g. 250m)
        memory_request:
          type: string
          description: Memory request (e.g. 512Mi)
        cpu_limit:
          type: string
          description: CPU limit (e.g. 1)
        memory_limit:
          type: string
          description: Memory limit (e.g. 2Gi)
        gpu:
          $ref: '#/components/schemas/GPUConfig'
          description: |
            GPU resource requirement for this adapter. Omit for CPU-only adapters; existing providers without this field are unchanged.
        env:
          type: array
          items:
            $ref: '#/components/schemas/EnvVar'
          description: Environment variables for the job
        image_pull_policy:
          type: string
          enum:
            - if_not_present
            - always
          # Machine-readable form of the default the description already
          # states ("if_not_present (default when omitted)").
          default: if_not_present
          description: |
            When the adapter container image is pulled. API values are if_not_present (default when omitted) or always. EvalHub maps these to the Kubernetes imagePullPolicy values IfNotPresent and Always on the adapter container only. Custom providers may set always during development to pick up fresh image tags. Sidecar and init containers are not configurable and always use the Kubernetes IfNotPresent policy.
      required:
        - image
        - entrypoint
    # Settings for running an evaluation as a process on the host.
    # `command` is the only required key.
    LocalRuntime:
      description: Local runtime configuration for running evaluations on the host
      type: object
      required:
        - command
      properties:
        command:
          description: Command to execute for local evaluation runs
          type: string
        env:
          description: Environment variables for the local process
          type: array
          items:
            $ref: '#/components/schemas/EnvVar'
    # Wrapper holding the per-environment runtime settings of a provider.
    # Neither `k8s` nor `local` is marked required by this schema.
    Runtime:
      description: Provider runtime configuration
      type: object
      properties:
        k8s:
          description: Kubernetes runtime configuration
          $ref: '#/components/schemas/K8sRuntime'
        local:
          description: Local runtime configuration
          $ref: '#/components/schemas/LocalRuntime'
    # Pairs a textual score interval with its human-readable meaning.
    # Both keys are mandatory.
    ScoreRange:
      description: A score range with its interpretation
      type: object
      required:
        - range
        - meaning
      properties:
        range:
          # The interval is carried as a string, not as numeric bounds.
          description: Score range (e.g. "0.0-0.25")
          type: string
        meaning:
          description: What scores in this range indicate
          type: string
    # Agent guidance attached to a single benchmark. All keys are optional.
    BenchmarkAgentMetadata:
      description: Benchmark-level agent guidance (overrides provider defaults)
      type: object
      properties:
        result_interpretation:
          # A single string at benchmark level.
          description: How to read results for this specific benchmark
          type: string
        score_ranges:
          description: Score ranges with their interpretations for this benchmark
          type: array
          items:
            $ref: '#/components/schemas/ScoreRange'
    # Describes one benchmark offered by a provider. The schema has no
    # `required` list, so every property is optional.
    BenchmarkResource:
      description: Benchmark specification (from a provider)
      type: object
      properties:
        id:
          description: Benchmark ID
          type: string
        url:
          description: Optional documentation or info URL for this benchmark (from provider config).
          type: string
        name:
          description: Benchmark name
          type: string
        description:
          description: Benchmark description
          type: string
        category:
          description: Category
          type: string
        metrics:
          description: Metric names
          type: array
          items:
            type: string
        num_few_shot:
          description: Number of few-shot examples
          type: integer
        dataset_size:
          description: Dataset size
          type: integer
        tags:
          description: Tags
          type: array
          items:
            type: string
        # Scoring and pass/fail settings are defined by shared schemas.
        primary_score:
          $ref: '#/components/schemas/PrimaryScore'
        pass_criteria:
          $ref: '#/components/schemas/PassCriteria'
        agent:
          description: Agent discoverability metadata for this benchmark
          $ref: '#/components/schemas/BenchmarkAgentMetadata'
    # Provider definition: identity, optional agent metadata, runtime settings
    # and the benchmarks it exposes. `name`, `benchmarks` and `runtime` are
    # mandatory.
    ProviderConfig:
      description: Evaluation provider (configuration and available benchmarks)
      type: object
      required:
        - name
        - benchmarks
        - runtime
      properties:
        name:
          description: Provider name
          type: string
        title:
          description: Provider display title
          type: string
        description:
          description: Provider description
          type: string
        tags:
          description: Provider tags
          type: array
          items:
            type: string
        agent:
          description: Agent discoverability metadata for this provider
          $ref: '#/components/schemas/AgentMetadata'
        runtime:
          description: Provider runtime configuration
          $ref: '#/components/schemas/Runtime'
        benchmarks:
          description: Benchmarks offered by this provider
          type: array
          items:
            $ref: '#/components/schemas/BenchmarkResource'
    # A provider as returned by the API: the ProviderConfig fields combined
    # (via allOf) with a `resource` metadata object.
    ProviderResource:
      description: Provider resource
      type: object
      allOf:
        - type: object
          properties:
            resource:
              description: Resource metadata
              $ref: '#/components/schemas/Resource'
        - $ref: '#/components/schemas/ProviderConfig'
    # Paginated provider listing: the Page fields combined (via allOf) with
    # the `items` payload and an `errors` list of string messages.
    ProviderResourceList:
      description: List of provider resources with pagination
      type: object
      allOf:
        - $ref: '#/components/schemas/Page'
        - type: object
          properties:
            items:
              description: Provider resources
              type: array
              items:
                $ref: '#/components/schemas/ProviderResource'
            errors:
              description: Non-fatal errors (e.g. partial list)
              type: array
              items:
                type: string
    # Closed set of operation names accepted in a PatchOperation's `op` field.
    PatchOp:
      description: Patch operation type
      type: string
      enum:
        - replace
        - add
        - remove
    # One entry of a patch document. `op` and `path` are mandatory; `value`
    # is optional.
    PatchOperation:
      description: Single JSON patch operation
      type: object
      required:
        - op
        - path
      properties:
        op:
          $ref: '#/components/schemas/PatchOp'
        path:
          description: JSON Pointer path
          type: string
        value:
          # No `type` keyword here, so the schema accepts a value of any type.
          description: Value for add/replace (omit for remove)
    # A benchmark entry inside a collection: the Ref fields combined (via
    # allOf) with collection-specific settings. `provider_id` is the only key
    # this schema itself marks as required.
    CollectionBenchmarkConfig:
      description: |
        Reference to a benchmark in a collection (create, update, list, or get collection).
        The **url** field is set by the server from the provider definition when a collection is created (POST), fully replaced (PUT), or when a whole benchmark array element is added or replaced via PATCH (paths `/benchmarks/<index>` or `/benchmarks/-`), then stored and returned on list/get; omitted when unknown. Field-level benchmark patches (e.g. `/benchmarks/0/id`) do not trigger this. It is not used on evaluation jobs.
      type: object
      allOf:
        - $ref: '#/components/schemas/Ref'
        - type: object
          required:
            - provider_id
          properties:
            provider_id:
              description: Evaluation provider ID.
              type: string
            url:
              description: Benchmark URL from the provider definition (persisted on create, full collection replace, or whole-benchmark PATCH; omitted when unknown).
              type: string
            weight:
              # Negative weights are excluded by `minimum: 0`.
              description: Weight of this benchmark.
              type: number
              format: float
              minimum: 0
            primary_score:
              $ref: '#/components/schemas/PrimaryScore'
            pass_criteria:
              $ref: '#/components/schemas/PassCriteria'
            parameters:
              # Open-ended object: arbitrary keys and values are permitted.
              description: Benchmark specific parameters.
              type: object
              additionalProperties: true
            test_data_ref:
              description: |
                Optional external test data (e.g. **S3**). For **Kubernetes Job** runs, may
                trigger a pre-run init container that downloads files to **`/test_data`** for
                the adapter. Omit if the benchmark does not need mounted files.
              $ref: '#/components/schemas/TestDataRef'
    # Body used to create or update a collection. `name`, `category` and
    # `benchmarks` are mandatory; the remaining keys are optional.
    CollectionConfig:
      description: Request to create or update a collection.
      type: object
      required:
        - name
        - category
        - benchmarks
      properties:
        name:
          description: Collection name.
          type: string
        category:
          description: Collection category.
          type: string
        description:
          description: Optional description.
          type: string
        tags:
          description: Tags.
          type: array
          items:
            type: string
        custom:
          # Open-ended object: arbitrary keys and values are permitted.
          description: Custom key-value data.
          type: object
          additionalProperties: true
        pass_criteria:
          description: Pass criteria for the collection.
          $ref: '#/components/schemas/PassCriteria'
        benchmarks:
          description: Benchmarks in the collection.
          type: array
          items:
            $ref: '#/components/schemas/CollectionBenchmarkConfig'
    # A collection as returned by the API: the CollectionConfig fields
    # combined (via allOf) with a `resource` metadata object.
    CollectionResource:
      description: Collection resource
      type: object
      allOf:
        - type: object
          properties:
            resource:
              description: Resource metadata
              $ref: '#/components/schemas/Resource'
        - $ref: '#/components/schemas/CollectionConfig'
    # Paginated collection listing: the Page fields combined (via allOf) with
    # the `items` payload.
    CollectionResourceList:
      description: List of collection resources with pagination
      type: object
      allOf:
        - $ref: '#/components/schemas/Page'
        - type: object
          properties:
            items:
              description: Collection resources
              type: array
              items:
                $ref: '#/components/schemas/CollectionResource'
  # Reusable error-payload examples, one per HTTP error status. Each value
  # carries a `message`, a `message_code` and a `trace` identifier. String
  # values are quoted explicitly so no parser can re-type them.
  examples:
    # 400
    BadRequestError:
      summary: 'Bad request error with status code 400'
      value:
        message: "The field 'state' is not valid."
        message_code: 'invalid_value'
        trace: 'b12692e1-8582-4628-88ca-7a13fefb73e2'
    # 401
    AuthenticationError:
      summary: 'Authentication error with status code 401'
      value:
        message: 'The bearer token is not valid.'
        message_code: 'invalid_auth_token'
        trace: 'b12692e1-8582-4628-88ca-7a13fefb73e2'
    # 403
    AuthorizationError:
      summary: 'Authorization error with status code 403'
      value:
        message: 'The user does not have access to the endpoint.'
        message_code: 'endpoint_access_forbidden'
        trace: 'b12692e1-8582-4628-88ca-7a13fefb73e2'
    # 404
    NotFoundError:
      summary: 'Not found error with status code 404'
      value:
        message: 'The resource was not found.'
        message_code: 'not_found'
        trace: 'b12692e1-8582-4628-88ca-7a13fefb73e2'
    # 409
    ConflictError:
      summary: 'Conflict error with status code 409'
      value:
        message: "The job 'f02b16a2-1990-4626-b24d-1cff3febdbfb' can not be cancelled because it is 'completed'."
        message_code: 'conflict'
        trace: 'b12692e1-8582-4628-88ca-7a13fefb73e2'
  responses:
    # Reusable 400 response: JSON body typed as Error, illustrated by the
    # shared BadRequestError example.
    BadRequest:
      description: Bad Request
      content:
        application/json:
          examples:
            BadRequestError:
              $ref: '#/components/examples/BadRequestError'
          schema:
            $ref: '#/components/schemas/Error'
    # Reusable 401 response: JSON body typed as Error, illustrated by the
    # shared AuthenticationError example.
    Unauthorized:
      description: Unauthorized
      content:
        application/json:
          examples:
            AuthenticationError:
              $ref: '#/components/examples/AuthenticationError'
          schema:
            $ref: '#/components/schemas/Error'
    # Reusable 403 response: JSON body typed as Error, illustrated by the
    # shared AuthorizationError example.
    Forbidden:
      description: Forbidden
      content:
        application/json:
          examples:
            AuthorizationError:
              $ref: '#/components/examples/AuthorizationError'
          schema:
            $ref: '#/components/schemas/Error'
    # Reusable 404 response: JSON body typed as Error, illustrated by the
    # shared NotFoundError example.
    NotFound:
      description: Not Found
      content:
        application/json:
          examples:
            NotFoundError:
              $ref: '#/components/examples/NotFoundError'
          schema:
            $ref: '#/components/schemas/Error'
    # Reusable Conflict response: JSON body typed as Error, illustrated by the
    # shared ConflictError example (whose summary names status code 409).
    Conflict:
      description: Conflict
      content:
        application/json:
          # Body schema shared with the other error responses in this section.
          schema:
            $ref: '#/components/schemas/Error'
          examples:
            ConflictError:
              $ref: '#/components/examples/ConflictError'