openapi: 3.0.3 info: title: Runloop Benchmark API version: '0.1' description: "Run and manage Benchmarks and Benchmark Runs \u2014 the evaluation framework for AI coding agents. Supports SWE-Bench, SWE-smith, and custom benchmark definitions, scenario aggregation, run lifecycle (start/cancel/complete), scoring, and log retrieval." contact: name: Runloop AI Support url: https://runloop.ai email: support@runloop.ai servers: - url: https://api.runloop.ai description: Runloop API variables: {} tags: - name: Benchmark paths: /v1/benchmark_jobs: post: tags: - Benchmark summary: '[Beta] Create a BenchmarkJob.' description: '[Beta] Create a BenchmarkJob that runs a set of scenarios entirely on runloop.' operationId: createBenchmarkJob parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/BenchmarkJobCreateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkJobView' deprecated: false get: tags: - Benchmark summary: '[Beta] List BenchmarkJobs.' description: '[Beta] List all BenchmarkJobs matching filter.' operationId: listBenchmarkJobs parameters: - name: name in: query description: Filter by name required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. 
required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkJobListView' deprecated: false /v1/benchmark_jobs/{id}: get: tags: - Benchmark summary: '[Beta] Get a previously created BenchmarkJob.' description: '[Beta] Get a BenchmarkJob given ID.' operationId: getBenchmarkJob parameters: - name: id in: path description: The BenchmarkJob ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkJobView' deprecated: false /v1/benchmark_runs: get: tags: - Benchmark summary: List BenchmarkRuns. description: List all BenchmarkRuns matching filter. operationId: listBenchmarkRuns parameters: - name: name in: query description: Filter by name required: false deprecated: false allowEmptyValue: true schema: type: string - name: benchmark_id in: query description: The Benchmark ID to filter by. required: false deprecated: false allowEmptyValue: true schema: type: string - name: state in: query description: Filter by state required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. 
required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunListView' deprecated: false /v1/benchmark_runs/{id}: get: tags: - Benchmark summary: Get a previously created BenchmarkRun. description: Get a BenchmarkRun given ID. operationId: getBenchmarkRun parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: false /v1/benchmark_runs/{id}/cancel: post: tags: - Benchmark summary: Cancel a currently running Benchmark run. description: 'Cancel a Benchmark run. This will do the following: 1. Cancel all running scenarios and shutdown the underlying Devbox resources 2. Update the benchmark state to CANCELED 3. Calculate final score from completed scenarios' operationId: cancelBenchmarkRun parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: false /v1/benchmark_runs/{id}/complete: post: tags: - Benchmark summary: Complete a BenchmarkRun. description: Complete a currently running BenchmarkRun. operationId: completeBenchmarkRun parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: false /v1/benchmark_runs/{id}/download_logs: post: tags: - Benchmark summary: Download logs for a Benchmark run. description: Download a zip file containing all logs for a Benchmark run. 
operationId: downloadBenchmarkRunLogs parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/zip: schema: type: string format: binary headers: Content-Type: description: application/zip required: true schema: type: string Content-Disposition: description: attachment; filename="benchmark_run_logs.zip" required: true schema: type: string deprecated: false /v1/benchmark_runs/{id}/scenario_runs: get: tags: - Benchmark summary: List started scenario runs for a benchmark run. description: List started scenario runs for a benchmark run. operationId: listBenchmarkRunScenarioRuns parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string - name: state in: query description: Filter by Scenario Run state required: false deprecated: false allowEmptyValue: true schema: $ref: '#/components/schemas/ScenarioRunState' - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunListView' deprecated: false /v1/benchmarks: post: tags: - Benchmark summary: Create a Benchmark. description: Create a Benchmark with a set of Scenarios. 
operationId: createBenchmark parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/BenchmarkCreateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' deprecated: false get: tags: - Benchmark summary: List Benchmarks. description: List all Benchmarks matching filter. operationId: listBenchmarks parameters: - name: name in: query description: Filter by name required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionListView' deprecated: false /v1/benchmarks/list_public: get: tags: - Benchmark summary: List Public Benchmarks. description: List all public benchmarks matching filter. operationId: listPublicBenchmarks parameters: - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. 
required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionListView' deprecated: false /v1/benchmarks/metadata/keys: get: tags: - Benchmark summary: List available benchmark metadata keys. description: Returns a list of all available metadata keys that can be used for filtering benchmarks. operationId: getBenchmarkMetadataKeys parameters: [] responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/MetadataKeysView' deprecated: false /v1/benchmarks/metadata/keys/{key}/values: get: tags: - Benchmark summary: List values for a specific benchmark metadata key. description: Returns a list of all available values for the given metadata key that can be used for filtering benchmarks. operationId: getBenchmarkMetadataValues parameters: - name: key in: path description: The metadata key to get values for. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/MetadataValuesView' '400': description: Invalid metadata key provided. deprecated: false /v1/benchmarks/runs: get: tags: - Benchmark summary: List BenchmarkRuns. description: List all BenchmarkRuns matching filter. operationId: listBenchmarkRunsDeprecated parameters: - name: name in: query description: Filter by name required: false deprecated: false allowEmptyValue: true schema: type: string - name: benchmark_id in: query description: The Benchmark ID to filter by. 
required: false deprecated: false allowEmptyValue: true schema: type: string - name: state in: query description: Filter by state required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunListView' deprecated: true /v1/benchmarks/runs/{id}: get: tags: - Benchmark summary: Get a previously created BenchmarkRun. description: Get a BenchmarkRun given ID. operationId: getBenchmarkRunDeprecated parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: true /v1/benchmarks/runs/{id}/cancel: post: tags: - Benchmark summary: Cancel a currently running Benchmark run. description: 'Cancel a Benchmark run. This will do the following: 1. Cancel all running scenarios and shutdown the underlying Devbox resources 2. Update the benchmark state to CANCELED 3. Calculate final score from completed scenarios' operationId: cancelBenchmarkRunDeprecated parameters: - name: id in: path description: The BenchmarkRun ID. 
required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: true /v1/benchmarks/runs/{id}/complete: post: tags: - Benchmark summary: Complete a BenchmarkRun. description: Complete a currently running BenchmarkRun. operationId: completeBenchmarkRunDeprecated parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: true /v1/benchmarks/runs/{id}/download_logs: post: tags: - Benchmark summary: Download logs for a Benchmark run. description: Download a zip file containing all logs for a Benchmark run. operationId: downloadBenchmarkRunLogsDeprecated parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/zip: schema: type: string format: binary headers: Content-Type: description: application/zip required: true schema: type: string Content-Disposition: description: attachment; filename="benchmark_run_logs.zip" required: true schema: type: string deprecated: true /v1/benchmarks/runs/{id}/scenario_runs: get: tags: - Benchmark summary: List started scenario runs for a benchmark run. description: List started scenario runs for a benchmark run. operationId: listBenchmarkRunScenarioRunsDeprecated parameters: - name: id in: path description: The BenchmarkRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string - name: state in: query description: Filter by Scenario Run state required: false deprecated: false allowEmptyValue: true schema: $ref: '#/components/schemas/ScenarioRunState' - name: limit in: query description: The limit of items to return. Default is 20. 
Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunListView' deprecated: true /v1/benchmarks/start_run: post: tags: - Benchmark summary: Start a new BenchmarkRun. description: Start a new BenchmarkRun based on the provided Benchmark. operationId: startBenchmarkRun parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/StartBenchmarkRunParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunView' deprecated: false /v1/benchmarks/{id}: post: tags: - Benchmark summary: Update a Benchmark. description: Update a Benchmark. Fields that are null will preserve the existing value. Fields that are provided (including empty values) will replace the existing value entirely. operationId: updateBenchmark parameters: - name: id in: path description: The Benchmark ID. required: true deprecated: false allowEmptyValue: false schema: type: string requestBody: content: application/json: schema: $ref: '#/components/schemas/BenchmarkUpdateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' deprecated: false get: tags: - Benchmark summary: Get a Benchmark. description: Get a previously created Benchmark. 
operationId: getBenchmark parameters: - name: id in: path description: The Benchmark ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' deprecated: false /v1/benchmarks/{id}/archive: post: tags: - Benchmark summary: Archive a Benchmark. description: Archive a previously created Benchmark. The benchmark will no longer appear in list endpoints but can still be retrieved by ID. operationId: archiveBenchmark parameters: - name: id in: path description: The ID of the Benchmark to archive. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' '403': description: Cannot archive public benchmarks. '404': description: Benchmark not found. deprecated: false /v1/benchmarks/{id}/definitions: get: tags: - Benchmark summary: Get scenario definitions for a Benchmark. description: Get scenario definitions for a previously created Benchmark. operationId: getBenchmarkScenarioDefinitions parameters: - name: id in: path description: The Benchmark ID. required: true deprecated: false allowEmptyValue: false schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionListView' deprecated: false /v1/benchmarks/{id}/runs: get: tags: - Benchmark summary: Get runs for a provided Benchmark. description: Get runs for a previously created Benchmark. 
operationId: getBenchmarkRuns parameters: - name: id in: path description: The Benchmark ID. required: true deprecated: false allowEmptyValue: false schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkRunListView' deprecated: false /v1/benchmarks/{id}/scenarios: post: tags: - Benchmark summary: Modify scenarios for a Benchmark. description: Add and/or remove Scenario IDs from an existing Benchmark. operationId: updateBenchmarkScenarios parameters: - name: id in: path description: The Benchmark ID. required: true deprecated: false allowEmptyValue: false schema: type: string requestBody: content: application/json: schema: $ref: '#/components/schemas/BenchmarkScenarioUpdateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' deprecated: false /v1/benchmarks/{id}/unarchive: post: tags: - Benchmark summary: Unarchive a Benchmark. description: Unarchive a previously archived Benchmark. The benchmark will appear in list endpoints again. operationId: unarchiveBenchmark parameters: - name: id in: path description: The ID of the Benchmark to unarchive. 
required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/BenchmarkDefinitionView' '403': description: Cannot unarchive public benchmarks. '404': description: Benchmark not found. deprecated: false components: schemas: AgentMount: type: object additionalProperties: false properties: agent_id: type: string nullable: true description: The ID of the agent to mount. Either agent_id or agent_name must be set. agent_name: type: string nullable: true description: The name of the agent to mount. Returns the most recent agent with a matching name if no agent_id is provided. Either agent_id or agent_name must be set. agent_path: type: string nullable: true description: Path to mount the agent on the Devbox. Required for git and object agents. Use absolute path (e.g., /home/user/agent) auth_token: type: string nullable: true description: Optional auth token for private repositories. Only used for git agents. type: type: string enum: - agent_mount default: agent_mount required: - agent_id - agent_name - type Architecture: type: string enum: - x86_64 - arm64 AstGrepScoringFunction: type: object additionalProperties: false description: AstGrepScoringFunction utilizes structured code search for scoring. properties: lang: type: string description: The language of the pattern. search_directory: type: string description: The path to search. pattern: type: string description: AST pattern to match. Pattern will be passed to ast-grep using the command line surrounded by double quotes ("), so make sure to use proper escaping (for example, \$\$\$). type: type: string enum: - ast_grep_scorer default: ast_grep_scorer required: - search_directory - pattern - type BashScriptScoringFunction: type: object additionalProperties: false description: BashScriptScoringFunction is a scoring function specified by a bash script that will be run in the context of your environment. 
properties: bash_script: type: string description: A single bash script that sets up the environment, scores, and prints the final score to standard out. Score should be a float between 0.0 and 1.0, and look like "score=[0.0..1.0]". type: type: string enum: - bash_script_scorer default: bash_script_scorer required: - type BenchmarkCreateParameters: type: object additionalProperties: false description: BenchmarkCreateParameters contain the set of parameters to create a Benchmark. properties: name: type: string description: The unique name of the Benchmark. scenario_ids: type: array items: type: string nullable: true description: The Scenario IDs that make up the Benchmark. metadata: type: object additionalProperties: type: string nullable: true description: User defined metadata to attach to the benchmark. required_environment_variables: type: array items: type: string nullable: true description: Environment variables required to run the benchmark. If any required variables are not supplied, the benchmark will fail to start. required_secret_names: type: array items: type: string description: Secrets required to run the benchmark with (environment variable name will be mapped to your user secret by name). If any of these secrets are not provided or the mapping is incorrect, the benchmark will fail to start. attribution: type: string nullable: true description: Attribution information for the benchmark. description: type: string nullable: true description: Detailed description of the benchmark. 
required: - name BenchmarkDefJobSource: type: object additionalProperties: false description: Benchmark definition job source properties: benchmark_id: type: string description: The ID of the benchmark definition benchmark_name: type: string nullable: true description: Optional user-provided name for the benchmark definition type: type: string enum: - benchmark default: benchmark required: - benchmark_id - type BenchmarkDefinitionJobSpec: type: object additionalProperties: false description: Specifies a benchmark definition with runtime configuration. The benchmark definition's scenarios will be executed using the provided agent and orchestrator configurations. properties: benchmark_id: type: string description: ID of the benchmark definition to run. The scenarios from this benchmark will be executed. agent_configs: type: array items: $ref: '#/components/schemas/JobAgentConfig' description: Agent configurations to use for this run. Must specify at least one agent. orchestrator_config: $ref: '#/components/schemas/JobOrchestratorConfig' nullable: true description: Orchestrator configuration (optional overrides). If not provided, default values will be used. type: type: string enum: - benchmark default: benchmark required: - benchmark_id - agent_configs - type BenchmarkDefinitionListView: type: object additionalProperties: false properties: benchmarks: type: array items: $ref: '#/components/schemas/BenchmarkDefinitionView' description: List of Benchmarks matching filter. has_more: type: boolean total_count: type: integer format: int32 nullable: true required: - benchmarks - has_more BenchmarkDefinitionView: type: object additionalProperties: false description: A BenchmarkDefinitionView represents a grouped set of Scenarios that together form a Benchmark. properties: id: type: string description: The ID of the Benchmark. name: type: string description: The name of the Benchmark. 
scenarioIds: type: array items: type: string description: List of Scenario IDs that make up the benchmark. metadata: type: object additionalProperties: type: string description: User defined metadata to attach to the benchmark for organization. required_environment_variables: type: array items: type: string description: Required environment variables used to run the benchmark. If any required environment variables are missing, the benchmark will fail to start. required_secret_names: type: array items: type: string description: Required secrets used to run the benchmark. If any required secrets are missing, the benchmark will fail to start. is_public: type: boolean description: Whether this benchmark is public. attribution: type: string description: Attribution information for the benchmark. description: type: string description: Detailed description of the benchmark. status: $ref: '#/components/schemas/BenchmarkStatus' description: Whether the benchmark is active or archived. Archived benchmarks are excluded from listings and cannot be run. required: - id - name - scenarioIds - metadata - status BenchmarkJobCreateParameters: type: object additionalProperties: false description: BenchmarkJobCreateParameters contain the set of parameters to create a BenchmarkJob. properties: name: type: string nullable: true description: The name of the BenchmarkJob. If not provided, name will be generated based on target dataset. spec: $ref: '#/components/schemas/BenchmarkJobSpec' nullable: true description: The job specification. Exactly one spec type must be set. BenchmarkJobListView: type: object additionalProperties: false properties: jobs: type: array items: $ref: '#/components/schemas/BenchmarkJobView' description: List of BenchmarkJobs matching filter. 
has_more: type: boolean total_count: type: integer format: int32 nullable: true required: - jobs - has_more BenchmarkJobSpec: oneOf: - $ref: '#/components/schemas/HarborJobSpec' - $ref: '#/components/schemas/BenchmarkDefinitionJobSpec' - $ref: '#/components/schemas/ScenarioDefinitionJobSpec' discriminator: propertyName: type mapping: harbor: '#/components/schemas/HarborJobSpec' benchmark: '#/components/schemas/BenchmarkDefinitionJobSpec' scenarios: '#/components/schemas/ScenarioDefinitionJobSpec' BenchmarkJobState: type: string enum: - initializing - queued - running - completed - failed - cancelled - timeout BenchmarkJobView: type: object additionalProperties: false description: A BenchmarkJobView represents a benchmark job that runs a set of scenarios entirely on runloop. properties: id: type: string description: The ID of the BenchmarkJob. name: type: string description: The unique name of the BenchmarkJob. state: $ref: '#/components/schemas/BenchmarkJobState' description: The current state of the benchmark job. job_source: $ref: '#/components/schemas/JobSource' nullable: true description: The source configuration that was used to create this job. Either Harbor YAML or benchmark definition reference. job_spec: $ref: '#/components/schemas/JobSpec' nullable: true description: The resolved job specification. Contains scenarios, agents, and orchestrator config. failure_reason: type: string nullable: true description: Failure reason if job failed. benchmark_outcomes: type: array items: $ref: '#/components/schemas/BenchmarkOutcomeView' nullable: true description: Detailed outcome data for each benchmark run created by this job. Includes per-agent results and scenario-level details. in_progress_runs: type: array items: $ref: '#/components/schemas/InProgressRunView' nullable: true description: Benchmark runs currently in progress for this job. Shows runs that have not yet completed. 
create_time_ms: type: integer format: int64 description: Timestamp when job was created (Unix milliseconds). required: - id - name - state - create_time_ms BenchmarkOutcomeView: type: object additionalProperties: false description: Outcome data for a single benchmark run within a benchmark job, representing results for one agent configuration. properties: benchmark_run_id: type: string description: The ID of the benchmark run. agent_name: type: string description: The name of the agent configuration used. model_name: type: string nullable: true description: The model name used by the agent. n_completed: type: integer format: int32 description: Number of scenarios that completed successfully. n_failed: type: integer format: int32 description: Number of scenarios that failed. n_timeout: type: integer format: int32 description: Number of scenarios that timed out. average_score: type: number format: float nullable: true description: Average score across all completed scenarios (0.0 to 1.0). duration_ms: type: integer format: int64 nullable: true description: Total duration of the benchmark run in milliseconds. scenario_outcomes: type: array items: $ref: '#/components/schemas/ScenarioOutcomeView' description: Detailed outcomes for each scenario in this benchmark run. required: - benchmark_run_id - agent_name - n_completed - n_failed - n_timeout - scenario_outcomes BenchmarkRunListView: type: object additionalProperties: false properties: runs: type: array items: $ref: '#/components/schemas/BenchmarkRunView' description: List of BenchmarkRuns matching filter. has_more: type: boolean total_count: type: integer format: int32 nullable: true required: - runs - has_more BenchmarkRunState: type: string enum: - running - canceled - completed - failed BenchmarkRunView: type: object additionalProperties: false description: A BenchmarkRunView represents a run of a complete set of Scenarios, organized under a Benchmark or created by a BenchmarkJob. 
# Property list of BenchmarkRunView, followed by BenchmarkScenarioUpdateParameters.
# BenchmarkRunView requires id, start_time_ms, state and metadata; every other field is optional.
# NOTE(review): `state` places `description` next to `$ref`; OpenAPI 3.0 tooling may ignore
# siblings of `$ref` - confirm the generated docs actually show this text.
properties: id: type: string description: The ID of the BenchmarkRun. benchmark_id: type: string nullable: true description: The ID of the Benchmark definition. Present if run was created from a benchmark definition. name: type: string nullable: true description: The name of the BenchmarkRun. start_time_ms: type: integer format: int64 description: The time the benchmark run execution started (Unix timestamp milliseconds). duration_ms: type: integer format: int64 nullable: true description: The duration for the BenchmarkRun to complete. state: $ref: '#/components/schemas/BenchmarkRunState' description: The state of the BenchmarkRun. score: type: number format: float nullable: true description: The final score across the BenchmarkRun, present once completed. Calculated as sum of scenario scores / number of scenario runs. metadata: type: object additionalProperties: type: string description: User defined metadata to attach to the benchmark run for organization. purpose: type: string nullable: true description: Purpose of the run. environment_variables: type: object additionalProperties: type: string nullable: true description: Environment variables used to run the benchmark. secrets_provided: type: object additionalProperties: type: string nullable: true description: 'User secrets used to run the benchmark. Example: {"DB_PASS": "DATABASE_PASSWORD"} would set the environment variable ''DB_PASS'' on all scenario devboxes to the value of the secret ''DATABASE_PASSWORD''.' required: - id - start_time_ms - state - metadata BenchmarkScenarioUpdateParameters: type: object additionalProperties: false properties: scenarios_to_add: type: array items: type: string nullable: true description: Scenario IDs to add to the Benchmark. scenarios_to_remove: type: array items: type: string nullable: true description: Scenario IDs to remove from the Benchmark.
# BenchmarkStatus, BenchmarkUpdateParameters, BrokerMount, BrokerMountProtocol, BuiltInScoringFunction (union discriminated by `type`), and the start of CodeMount.
BenchmarkStatus: type: string enum: - active - archived BenchmarkUpdateParameters: type: object additionalProperties: false description: BenchmarkUpdateParameters contain the set of parameters to update a Benchmark. All fields are optional - null fields preserve existing values, provided fields replace entirely. properties: name: type: string nullable: true description: The unique name of the Benchmark. Cannot be blank. scenario_ids: type: array items: type: string nullable: true description: The Scenario IDs that make up the Benchmark. Pass in empty list to clear. metadata: type: object additionalProperties: type: string nullable: true description: User defined metadata to attach to the benchmark. Pass in empty map to clear. required_environment_variables: type: array items: type: string nullable: true description: Environment variables required to run the benchmark. If any required variables are not supplied, the benchmark will fail to start. Pass in empty list to clear. required_secret_names: type: array items: type: string nullable: true description: Secrets required to run the benchmark with (environment variable name will be mapped to your user secret by name). If any of these secrets are not provided or the mapping is incorrect, the benchmark will fail to start. Pass in empty list to clear. attribution: type: string nullable: true description: Attribution information for the benchmark. Pass in empty string to clear. description: type: string nullable: true description: Detailed description of the benchmark. Pass in empty string to clear. BrokerMount: type: object additionalProperties: false properties: axon_id: type: string description: The ID of the axon event stream to mount onto the Devbox. protocol: $ref: '#/components/schemas/BrokerMountProtocol' nullable: true description: The protocol used by the broker to deliver events to the agent. agent_binary: type: string nullable: true description: Binary to launch the agent (e.g., 'opencode').
Used by protocols that launch a subprocess (acp, claude_json). working_directory: type: string nullable: true description: Working directory in which to launch the agent binary. Defaults to the home directory if not specified. launch_args: type: array items: type: string nullable: true description: Arguments to pass to the agent command (e.g., ['acp']). Used by protocols that launch a subprocess (acp, claude_json). type: type: string enum: - broker_mount default: broker_mount required: - axon_id - type BrokerMountProtocol: type: string enum: - acp - claude_json BuiltInScoringFunction: oneOf: - $ref: '#/components/schemas/AstGrepScoringFunction' - $ref: '#/components/schemas/BashScriptScoringFunction' - $ref: '#/components/schemas/CommandScoringFunction' - $ref: '#/components/schemas/CustomScoringFunction' - $ref: '#/components/schemas/PythonScriptScoringFunction' - $ref: '#/components/schemas/TestBasedScoringFunction' discriminator: propertyName: type mapping: ast_grep_scorer: '#/components/schemas/AstGrepScoringFunction' bash_script_scorer: '#/components/schemas/BashScriptScoringFunction' command_scorer: '#/components/schemas/CommandScoringFunction' custom_scorer: '#/components/schemas/CustomScoringFunction' python_script_scorer: '#/components/schemas/PythonScriptScoringFunction' test_based_scorer: '#/components/schemas/TestBasedScoringFunction' CodeMount: type: object additionalProperties: false properties: repo_name: type: string description: The name of the repo to mount. By default, code will be mounted at /home/user/{repo_name}. repo_owner: type: string description: The owner of the repo. install_command: type: string nullable: true description: Installation command to install and setup repository. git_ref: type: string nullable: true description: Optional git ref (branch or tag) to checkout. Defaults to the repository default branch. token: type: string nullable: true description: The authentication token necessary to pull repo.
# Tail of CodeMount (type discriminator + required list), then CommandScoringFunction, CustomScoringFunction, ExternalApiAgentConfig, FailureReason, and the start of FileMount.
type: type: string enum: - code_mount default: code_mount required: - repo_name - repo_owner - type CommandScoringFunction: type: object additionalProperties: false description: CommandScoringFunction executes a single command and checks the result. The output of the command will be printed. Scoring passes if the command returns status code 0; otherwise it fails. properties: command: type: string description: The command to execute. type: type: string enum: - command_scorer default: command_scorer required: - type CustomScoringFunction: type: object additionalProperties: false description: CustomScoringFunction is a custom, user defined scoring function. properties: custom_scorer_type: type: string description: Type of the scoring function, previously registered with Runloop. scorer_params: type: object nullable: true description: Additional JSON structured context to pass to the scoring function. type: type: string enum: - custom_scorer default: custom_scorer required: - custom_scorer_type - type ExternalApiAgentConfig: type: object additionalProperties: false description: Configuration for externally-driven benchmark runs via API properties: info: type: string nullable: true description: Placeholder for future external agent metadata type: type: string enum: - external_api default: external_api required: - type FailureReason: type: object additionalProperties: false description: Information about why a scenario execution failed properties: exception_type: type: string description: The exception class name (e.g., 'TimeoutException', 'AgentTimeoutError') exception_message: type: string description: The exception message providing context required: - exception_type - exception_message FileMount: type: object additionalProperties: false properties: target: type: string description: Target path where the file should be mounted. content: type: string description: Content of the file to mount.
# Tail of FileMount (type discriminator + required list), then HarborJobSource, HarborJobSpec,
# IdleAction, IdleConfigurationParameters, and the first properties of InProgressRunView.
# HarborJobSource and HarborJobSpec both use type 'harbor' and require inline_yaml; they are
# referenced from the JobSource and BenchmarkJobSpec unions respectively.
type: type: string enum: - file_mount default: file_mount required: - target - content - type HarborJobSource: type: object additionalProperties: false description: Harbor job source with inline YAML configuration properties: inline_yaml: type: string description: The Harbor job configuration as inline YAML content type: type: string enum: - harbor default: harbor required: - inline_yaml - type HarborJobSpec: type: object additionalProperties: false description: Harbor-based job specification with inline YAML configuration. properties: inline_yaml: type: string description: The Harbor job configuration as inline YAML content. type: type: string enum: - harbor default: harbor required: - inline_yaml - type IdleAction: type: string enum: - shutdown - suspend description: 'Action to take after Devbox idle timer is triggered. shutdown: Shutdown the Devbox. suspend: Suspend the Devbox. ' x-enum-descriptions: shutdown: Shutdown the Devbox. suspend: Suspend the Devbox. IdleConfigurationParameters: type: object additionalProperties: false properties: idle_time_seconds: type: integer format: int32 description: After idle_time_seconds, on_idle action will be taken. on_idle: $ref: '#/components/schemas/IdleAction' description: Action to take after Devbox becomes idle. required: - idle_time_seconds - on_idle InProgressRunView: type: object additionalProperties: false description: A lightweight view of a benchmark run currently in progress, showing basic execution details without full outcome data. properties: benchmark_run_id: type: string description: The ID of the benchmark run. agent_config: $ref: '#/components/schemas/RunAgentConfig' nullable: true description: Agent configuration used for this run. Specifies whether the run was driven by an external API agent or a job-defined agent. state: $ref: '#/components/schemas/BenchmarkRunState' description: The current state of the run. start_time_ms: type: integer format: int64 description: Start time (Unix milliseconds).
# Tail of InProgressRunView (duration_ms + required list), then InputContext, JobAgentConfig, and JobAgentEnvironment.
duration_ms: type: integer format: int64 nullable: true description: Duration so far in milliseconds. required: - benchmark_run_id - state - start_time_ms InputContext: type: object additionalProperties: false description: InputContext specifies the problem statement along with all additional context for a Scenario. properties: problem_statement: type: string description: The problem statement for the Scenario. additional_context: type: object nullable: true description: Additional JSON structured input context. required: - problem_statement JobAgentConfig: type: object additionalProperties: false description: Configuration for an agent in a benchmark job properties: agent_id: type: string nullable: true description: ID of the agent to use (optional if agent exists by name) name: type: string description: Name of the agent model_name: type: string nullable: true description: Model name override for this agent timeout_seconds: type: number format: float nullable: true description: Timeout in seconds for this agent kwargs: type: object additionalProperties: type: string nullable: true description: Additional kwargs for agent configuration agent_environment: $ref: '#/components/schemas/JobAgentEnvironment' nullable: true description: Environment configuration to use for this agent type: type: string enum: - job_agent default: job_agent required: - name - type JobAgentEnvironment: type: object additionalProperties: false description: Environment configuration for an agent in a benchmark job properties: environment_variables: type: object additionalProperties: type: string nullable: true description: Environment variables to set when launching the agent. secrets: type: object additionalProperties: type: string nullable: true description: Secrets to inject as environment variables when launching the agent. Map of environment variable names to secret IDs.
# JobOrchestratorConfig, JobSource (union discriminated by `type`), JobSpec, and the header of LaunchParameters.
JobOrchestratorConfig: type: object additionalProperties: false description: Orchestrator configuration for benchmark job execution properties: n_concurrent_trials: type: integer format: int32 nullable: true description: 'Number of concurrent trials to run (default: 1). Controls parallelism for scenario execution.' n_attempts: type: integer format: int32 nullable: true description: 'Number of retry attempts on failure (default: 0). This is the retry policy for failed scenarios.' timeout_multiplier: type: number format: float nullable: true description: 'Timeout multiplier for retries (default: 1.0). Each retry will multiply the timeout by this factor.' quiet: type: boolean nullable: true description: 'Suppress verbose output (default: false)' JobSource: oneOf: - $ref: '#/components/schemas/HarborJobSource' - $ref: '#/components/schemas/BenchmarkDefJobSource' - $ref: '#/components/schemas/ScenariosJobSource' discriminator: propertyName: type mapping: harbor: '#/components/schemas/HarborJobSource' benchmark: '#/components/schemas/BenchmarkDefJobSource' scenarios: '#/components/schemas/ScenariosJobSource' JobSpec: type: object additionalProperties: false description: Job specification describing scenarios and execution configuration properties: scenario_ids: type: array items: type: string description: List of scenario IDs to execute orchestrator_config: $ref: '#/components/schemas/JobOrchestratorConfig' nullable: true description: Orchestrator configuration agent_configs: type: array items: $ref: '#/components/schemas/JobAgentConfig' description: Agent configurations for this job required: - scenario_ids - agent_configs LaunchParameters: type: object additionalProperties: false description: LaunchParameters enable you to customize the resources available to your Devbox as well as the environment set up that should be completed before the Devbox is marked as 'running'.
# Properties of LaunchParameters (the list continues on the following line).
# NOTE(review): custom_cpu_cores is declared as integer/int32, yet its description allows 0.5,
# which an integer cannot represent - confirm the intended type.
# available_ports is described as deprecated and ignored.
properties: launch_commands: type: array items: type: string nullable: true description: Set of commands to be run at launch time, before the entrypoint process is run. resource_size_request: $ref: '#/components/schemas/ResourceSize' nullable: true description: 'Preset Devbox resources (vCPU, RAM in GiB, ephemeral disk in GiB). If not set, SMALL is used. X_SMALL: 0.5 vCPU, 1 GiB RAM, 4 GiB disk. SMALL: 1 vCPU, 2 GiB RAM, 4 GiB disk. MEDIUM: 2 vCPU, 4 GiB RAM, 8 GiB disk. LARGE: 2 vCPU, 8 GiB RAM, 16 GiB disk. X_LARGE: 4 vCPU, 16 GiB RAM, 16 GiB disk. XX_LARGE: 8 vCPU, 32 GiB RAM, 16 GiB disk. CUSTOM_SIZE: set custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size.' available_ports: type: array items: type: integer format: int32 nullable: true description: '[Deprecated] A list of ports to make available on the Devbox. This field is ignored.' keep_alive_time_seconds: type: integer format: int64 nullable: true description: Time in seconds after which Devbox will automatically shutdown. Default is 1 hour. Maximum is 48 hours (172800 seconds). after_idle: $ref: '#/components/schemas/IdleConfigurationParameters' nullable: true description: Configure Devbox lifecycle based on idle activity. If after_idle is set, Devbox will ignore keep_alive_time_seconds. If both after_idle and lifecycle.after_idle are set, they must have the same value. Use lifecycle.after_idle instead. custom_cpu_cores: type: integer format: int32 nullable: true description: Custom CPU cores. Must be 0.5, 1, or a multiple of 2. Max is 16. custom_gb_memory: type: integer format: int32 nullable: true description: Custom memory size in GiB. Must be 1 or a multiple of 2. Max is 64GiB. custom_disk_size: type: integer format: int32 nullable: true description: Custom disk size in GiB. Must be a multiple of 2. Min is 2GiB, max is 64GiB. architecture: $ref: '#/components/schemas/Architecture' nullable: true description: The target architecture for the Devbox. If unset, defaults to x86_64.
# Remaining LaunchParameters properties, then LifecycleConfigurationParameters.
# Idle policy can be given in two places (top-level after_idle and lifecycle.after_idle);
# the descriptions state that the two must match when both are set.
user_parameters: $ref: '#/components/schemas/UserParameters' nullable: true description: Specify the user for execution on Devbox. If not set, default `user` will be used. required_services: type: array items: type: string nullable: true description: A list of ContainerizedService names to be started when a Devbox is created. A valid ContainerizedService must be specified in Blueprint to be started. network_policy_id: type: string nullable: true description: (Optional) ID of the network policy to apply to Devboxes launched with these parameters. When set on a Blueprint launch parameters, Devboxes created from it will inherit this policy unless explicitly overridden. lifecycle: $ref: '#/components/schemas/LifecycleConfigurationParameters' nullable: true description: Lifecycle configuration for idle and resume behavior. Configure idle policy via lifecycle.after_idle (if both this and the top-level after_idle are set, they must match) and resume triggers via lifecycle.resume_triggers. LifecycleConfigurationParameters: type: object additionalProperties: false description: Lifecycle configuration for Devbox idle and resume behavior. Configure idle policy via after_idle and resume triggers via resume_triggers. properties: after_idle: $ref: '#/components/schemas/IdleConfigurationParameters' nullable: true description: Configure Devbox lifecycle based on idle activity. If both this and the top-level after_idle are set, they must have the same value. Prefer this field for new integrations. resume_triggers: $ref: '#/components/schemas/ResumeTriggers' nullable: true description: Triggers that can resume a suspended Devbox.
# MetadataKeysView, MetadataValuesView, Mount (union discriminated by `type`), ObjectMount, PythonScriptScoringFunction, ResourceSize, and ResumeTriggers.
MetadataKeysView: type: object additionalProperties: false properties: keys: type: array items: type: string MetadataValuesView: type: object additionalProperties: false properties: key: type: string values: type: array items: type: string Mount: oneOf: - $ref: '#/components/schemas/ObjectMount' - $ref: '#/components/schemas/AgentMount' - $ref: '#/components/schemas/CodeMount' - $ref: '#/components/schemas/FileMount' - $ref: '#/components/schemas/BrokerMount' discriminator: propertyName: type mapping: object_mount: '#/components/schemas/ObjectMount' agent_mount: '#/components/schemas/AgentMount' code_mount: '#/components/schemas/CodeMount' file_mount: '#/components/schemas/FileMount' broker_mount: '#/components/schemas/BrokerMount' ObjectMount: type: object additionalProperties: false properties: object_id: type: string description: The ID of the object to write. object_path: type: string description: The path to write the object on the Devbox. Use the absolute path of the object (e.g., /home/user/object.txt, or a directory such as /home/user/archive_dir if the object is an archive) type: type: string enum: - object_mount default: object_mount required: - object_id - object_path - type PythonScriptScoringFunction: type: object additionalProperties: false description: PythonScriptScoringFunction will run a python script in the context of your environment as a ScoringFunction. properties: requirements_contents: type: string nullable: true description: Package dependencies to be installed. The requirements should be a valid requirements.txt file. python_script: type: string description: Python script to be run. The script should output the score to standard out as a float between 0.0 and 1.0. python_version_constraint: type: string nullable: true description: Python version to run scoring.
Default is "==3.12.10" type: type: string enum: - python_script_scorer default: python_script_scorer required: - python_script - type ResourceSize: type: string enum: - X_SMALL - SMALL - MEDIUM - LARGE - X_LARGE - XX_LARGE - CUSTOM_SIZE description: 'The size of the Devbox resources for Runloop to allocate. X_SMALL: 0.5 cpu x 1GiB memory x 4GiB disk SMALL: 1 cpu x 2GiB memory x 4GiB disk MEDIUM: 2 cpu x 4GiB memory x 8GiB disk LARGE: 2 cpu x 8GiB memory x 16GiB disk X_LARGE: 4 cpu x 16GiB memory x 16GiB disk XX_LARGE: 8 cpu x 32GiB memory x 16GiB disk CUSTOM_SIZE: To choose a custom size, set this enum and also the custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size in launch parameters. CPU must be 0.5, 1, or a multiple of 2 (max 16). Memory must be 1 or a multiple of 2 (max 64GiB). Disk must be a multiple of 2 (min 2GiB, max 64GiB). The cpu:memory ratio must be between 1:2 and 1:8 inclusive. ' x-enum-descriptions: X_SMALL: 0.5 cpu x 1GiB memory x 4GiB disk SMALL: 1 cpu x 2GiB memory x 4GiB disk MEDIUM: 2 cpu x 4GiB memory x 8GiB disk LARGE: 2 cpu x 8GiB memory x 16GiB disk X_LARGE: 4 cpu x 16GiB memory x 16GiB disk XX_LARGE: 8 cpu x 32GiB memory x 16GiB disk CUSTOM_SIZE: To choose a custom size, set this enum and also the custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size in launch parameters. CPU must be 0.5, 1, or a multiple of 2 (max 16). Memory must be 1 or a multiple of 2 (max 64GiB). Disk must be a multiple of 2 (min 2GiB, max 64GiB). The cpu:memory ratio must be between 1:2 and 1:8 inclusive. ResumeTriggers: type: object additionalProperties: false description: Triggers that can resume a suspended Devbox. properties: http: type: boolean nullable: true description: When true, HTTP traffic to a suspended Devbox via tunnel will trigger a resume. axon_event: type: boolean nullable: true description: When true, axon events targeting a suspended Devbox will trigger a resume.
# RunAgentConfig (union discriminated by `type`), RunProfile, and the start of ScenarioDefinitionJobSpec.
# RunProfile.envVars carries literal values; RunProfile.secrets carries user secret names.
RunAgentConfig: oneOf: - $ref: '#/components/schemas/ExternalApiAgentConfig' - $ref: '#/components/schemas/JobAgentConfig' discriminator: propertyName: type mapping: external_api: '#/components/schemas/ExternalApiAgentConfig' job_agent: '#/components/schemas/JobAgentConfig' RunProfile: type: object additionalProperties: false properties: purpose: type: string nullable: true description: Purpose of the run. envVars: type: object additionalProperties: type: string nullable: true description: 'Mapping of Environment Variable to Value. May be shown in devbox logging. Example: {"DB_PASS": "DATABASE_PASSWORD_VALUE"} would set the environment variable ''DB_PASS'' to the value ''DATABASE_PASSWORD_VALUE''.' secrets: type: object additionalProperties: type: string nullable: true description: 'Mapping of Environment Variable to User Secret Name. Never shown in devbox logging. Example: {"DB_PASS": "DATABASE_PASSWORD"} would set the environment variable ''DB_PASS'' to the value of the secret ''DATABASE_PASSWORD''.' launchParameters: $ref: '#/components/schemas/LaunchParameters' nullable: true description: Additional runtime LaunchParameters to apply after the devbox starts. mounts: type: array items: $ref: '#/components/schemas/Mount' nullable: true description: A list of mounts to be included in the scenario run. ScenarioDefinitionJobSpec: type: object additionalProperties: false description: Specifies a set of scenarios with runtime configuration. The scenarios will be executed using the provided agent and orchestrator configurations. properties: scenario_ids: type: array items: type: string description: List of scenario IDs to execute agent_configs: type: array items: $ref: '#/components/schemas/JobAgentConfig' description: Agent configurations to use for this run. Must specify at least one agent. orchestrator_config: $ref: '#/components/schemas/JobOrchestratorConfig' nullable: true description: Orchestrator configuration (optional overrides).
If not provided, default values will be used. type: type: string enum: - scenarios default: scenarios required: - scenario_ids - agent_configs - type ScenarioDefinitionListView: type: object additionalProperties: false properties: scenarios: type: array items: $ref: '#/components/schemas/ScenarioDefinitionView' description: List of Scenarios matching filter. has_more: type: boolean total_count: type: integer format: int32 nullable: true required: - scenarios - has_more ScenarioDefinitionStatus: type: string enum: - active - archived ScenarioDefinitionView: type: object additionalProperties: false description: A ScenarioDefinitionView represents a repeatable AI coding evaluation test, complete with initial environment and scoring contract. properties: id: type: string description: The ID of the Scenario. name: type: string description: The name of the Scenario. environment: $ref: '#/components/schemas/ScenarioEnvironment' nullable: true description: The Environment in which the Scenario is run. input_context: $ref: '#/components/schemas/InputContext' description: The input context for the Scenario. scoring_contract: $ref: '#/components/schemas/ScoringContract' description: The scoring contract for the Scenario. metadata: type: object additionalProperties: type: string description: User defined metadata to attach to the scenario for organization. reference_output: type: string nullable: true description: A string representation of the reference output to solve the scenario. Commonly can be the result of a git diff or a sequence of command actions to apply to the environment. required_environment_variables: type: array items: type: string description: Environment variables required to run the scenario. If any required environment variables are missing, the scenario will fail to start. required_secret_names: type: array items: type: string description: Secrets required to run the scenario.
If any required secrets are missing, the scenario will fail to start. is_public: type: boolean description: Whether this scenario is public. validation_type: $ref: '#/components/schemas/ValidationType' nullable: true description: Validation strategy. scorer_timeout_sec: type: integer format: int32 nullable: true description: Timeout for scoring in seconds. Default 30 minutes (1800s). status: $ref: '#/components/schemas/ScenarioDefinitionStatus' description: Whether the scenario is active or archived. Archived scenarios are excluded from listings and cannot be updated. required: - id - name - input_context - scoring_contract - metadata - status ScenarioEnvironment: type: object additionalProperties: false description: ScenarioEnvironment specifies the environment in which a Scenario will be run. properties: blueprint_id: type: string nullable: true description: Use the blueprint with matching ID. snapshot_id: type: string nullable: true description: Use the snapshot with matching ID. launch_parameters: $ref: '#/components/schemas/LaunchParameters' nullable: true description: Optional launch parameters to apply to the devbox environment at launch. working_directory: type: string nullable: true description: The working directory where the agent is expected to fulfill the scenario. Scoring functions also run from the working directory. ScenarioOutcomeView: type: object additionalProperties: false description: Outcome data for a single scenario execution, including its final state and scoring results. properties: scenario_run_id: type: string nullable: true description: The ID of the scenario run. May be absent if the scenario failed during setup before a run was created. scenario_definition_id: type: string description: The ID of the scenario definition that was executed. scenario_name: type: string description: The name of the scenario. state: $ref: '#/components/schemas/ScenarioState' description: The final state of the scenario execution.
# Tail of ScenarioOutcomeView (score, duration_ms, failure_reason, required list), then
# ScenarioRunListView, ScenarioRunState, and the first properties of ScenarioRunView.
# ScenarioRunListView uses the same runs/has_more/total_count shape as BenchmarkRunListView.
score: type: number format: float nullable: true description: The score achieved for this scenario (0.0 to 1.0). Only present if state is COMPLETED. duration_ms: type: integer format: int64 nullable: true description: Duration of the scenario execution in milliseconds. failure_reason: $ref: '#/components/schemas/FailureReason' nullable: true description: Failure information if the scenario failed or timed out. Contains exception type and message. required: - scenario_definition_id - scenario_name - state ScenarioRunListView: type: object additionalProperties: false properties: runs: type: array items: $ref: '#/components/schemas/ScenarioRunView' description: List of ScenarioRuns matching filter. has_more: type: boolean total_count: type: integer format: int32 nullable: true required: - runs - has_more ScenarioRunState: type: string enum: - running - scoring - scored - completed - canceled - timeout - failed ScenarioRunView: type: object additionalProperties: false description: A ScenarioRunView represents a single run of a Scenario on a Devbox. When completed, the ScenarioRun will contain the final score and output of the run. properties: id: type: string description: ID of the ScenarioRun. name: type: string nullable: true description: Optional name of ScenarioRun. scenario_id: type: string description: ID of the Scenario that has been run. devbox_id: type: string description: ID of the Devbox on which the Scenario is running. benchmark_run_id: type: string nullable: true description: ID of the BenchmarkRun that this Scenario is associated with, if any. scoring_contract_result: $ref: '#/components/schemas/ScoringContractResultView' nullable: true description: The scoring result of the ScenarioRun. start_time_ms: type: integer format: int64 description: The time that the scenario started duration_ms: type: integer format: int64 nullable: true description: Duration scenario took to run.
# Tail of ScenarioRunView, then ScenarioState, ScenariosJobSource, ScoringContract, and the start of ScoringContractResultView.
state: $ref: '#/components/schemas/ScenarioRunState' description: The state of the ScenarioRun. metadata: type: object additionalProperties: type: string description: User defined metadata to attach to the scenario run for organization. purpose: type: string nullable: true description: Purpose of the ScenarioRun. environment_variables: type: object additionalProperties: type: string nullable: true description: Environment variables used to run the scenario. secrets_provided: type: object additionalProperties: type: string nullable: true description: User secrets used to run the scenario. required: - id - scenario_id - devbox_id - state - metadata ScenarioState: type: string enum: - COMPLETED - FAILED - TIMEOUT - CANCELED ScenariosJobSource: type: object additionalProperties: false description: Scenarios job source with a list of scenario definition IDs properties: scenario_ids: type: array items: type: string description: List of scenario definition IDs to execute type: type: string enum: - scenarios default: scenarios required: - scenario_ids - type ScoringContract: type: object additionalProperties: false description: ScoringContract specifies the set of scoring functions used to evaluate a Scenario. properties: scoring_function_parameters: type: array items: $ref: '#/components/schemas/ScoringFunction' description: A list of scoring functions used to evaluate the Scenario. required: - scoring_function_parameters ScoringContractResultView: type: object additionalProperties: false description: A ScoringContractResultView represents the result of running all scoring functions on a given input context. properties: score: type: number format: float description: Total score for all scoring contracts. This will be a value between 0 and 1. scoring_function_results: type: array items: $ref: '#/components/schemas/ScoringFunctionResultView' description: List of all individual scoring function results.
# Tail of ScoringContractResultView (required list), then ScoringFunction,
# ScoringFunctionResultView, ScoringFunctionResultViewState, and the first properties of
# StartBenchmarkRunParameters.
# ScoringFunction.scorer references the BuiltInScoringFunction union, which is discriminated by `type`.
required: - score - scoring_function_results ScoringFunction: type: object additionalProperties: false description: ScoringFunction specifies a method of scoring a Scenario. properties: name: type: string description: Name of scoring function. Names must only contain [a-zA-Z0-9_-]. scorer: $ref: '#/components/schemas/BuiltInScoringFunction' description: The scoring function to use for evaluating this scenario. The type field determines which built-in function to use. weight: type: number format: float description: Weight to apply to scoring function score. Weights of all scoring functions should sum to 1.0. required: - name - scorer - weight ScoringFunctionResultView: type: object additionalProperties: false description: A ScoringFunctionResultView represents the result of running a single scoring function on a given input context. properties: score: type: number format: float description: Final score for the given scoring function. scoring_function_name: type: string description: Scoring function name that ran. output: type: string description: Log output of the scoring function. state: $ref: '#/components/schemas/ScoringFunctionResultViewState' description: The state of the scoring function application. required: - score - scoring_function_name - output - state ScoringFunctionResultViewState: type: string enum: - unknown - complete - error StartBenchmarkRunParameters: type: object additionalProperties: false properties: benchmark_id: type: string description: ID of the Benchmark to run. run_name: type: string nullable: true description: Display name of the run. metadata: type: object additionalProperties: type: string nullable: true description: User defined metadata to attach to the benchmark run for organization.
# Tail of StartBenchmarkRunParameters (runProfile + required list), then TestBasedScoringFunction,
# TestFile, UserParameters, ValidationType, and finally the HTTP bearer security scheme together
# with the `security` entry that requires it.
# NOTE(review): TestBasedScoringFunction requires only `type`, and TestFile declares no required
# fields - confirm that test_command / file_path / file_contents are really optional.
runProfile: $ref: '#/components/schemas/RunProfile' nullable: true description: Runtime configuration to use for this benchmark run required: - benchmark_id TestBasedScoringFunction: type: object additionalProperties: false description: TestBasedScoringFunction writes test files to disk and executes a test command to verify the solution. properties: test_files: type: array items: $ref: '#/components/schemas/TestFile' description: List of test files to create test_command: type: string description: The command to execute for running the tests type: type: string enum: - test_based_scorer default: test_based_scorer required: - type TestFile: type: object additionalProperties: false properties: file_path: type: string description: Path to write content of the test file, relative to your environment's working directory file_contents: type: string description: Content of the test file UserParameters: type: object additionalProperties: false description: Configuration for the Linux user in the Devbox environment. properties: username: type: string description: Username for the Linux user. uid: type: integer format: int32 description: User ID (UID) for the Linux user. Must be a non-negative integer. required: - username - uid ValidationType: type: string enum: - UNSPECIFIED - FORWARD - REVERSE - EVALUATION securitySchemes: bearerAuth: scheme: bearer type: http security: - bearerAuth: []