openapi: 3.0.3 info: title: Runloop Scenario API version: '0.1' description: "Define and run individual evaluation Scenarios \u2014 the atomic unit of benchmarks. Supports scenario authoring,\ \ scenario runs lifecycle, scoring (ScenarioScorer), and metadata." contact: name: Runloop AI Support url: https://runloop.ai email: support@runloop.ai servers: - url: https://api.runloop.ai description: Runloop API variables: {} tags: - name: Scenario - name: ScenarioScorer paths: /v1/scenarios: post: tags: - Scenario summary: Create a Scenario. description: Create a Scenario, a repeatable AI coding evaluation test that defines the starting environment as well as evaluation success criteria. operationId: createScenario parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/ScenarioCreateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionView' deprecated: false get: tags: - Scenario summary: List Scenarios. description: List all Scenarios matching filter. operationId: listScenarios parameters: - name: name in: query description: Query for Scenarios with a given name. required: false deprecated: false allowEmptyValue: true schema: type: string - name: benchmark_id in: query description: Filter scenarios by benchmark ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: validation_type in: query description: Filter by validation type required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. 
required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionListView' deprecated: false /v1/scenarios/list_public: get: tags: - Scenario summary: List Public Scenarios. description: List all public scenarios matching filter. operationId: listPublicScenarios parameters: - name: name in: query description: Query for Scenarios with a given name. required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionListView' deprecated: false /v1/scenarios/metadata/keys: get: tags: - Scenario summary: List available scenario metadata keys. description: Returns a list of all available metadata keys that can be used for filtering scenarios. 
operationId: getScenarioMetadataKeys parameters: [] responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/MetadataKeysView' deprecated: false /v1/scenarios/metadata/keys/{key}/values: get: tags: - Scenario summary: List values for a specific scenario metadata key. description: Returns a list of all values that exist for a specific metadata key across all scenarios. operationId: getScenarioMetadataValues parameters: - name: key in: path description: The metadata key to get values for. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/MetadataValuesView' '400': description: Invalid metadata key provided. deprecated: false /v1/scenarios/runs: get: tags: - Scenario summary: List ScenarioRuns. description: List all ScenarioRuns matching filter. operationId: listScenarioRuns parameters: - name: name in: query description: Filter by name required: false deprecated: false allowEmptyValue: true schema: type: string - name: state in: query description: Filter by state required: false deprecated: false allowEmptyValue: true schema: type: string - name: benchmark_run_id in: query description: Filter by benchmark run ID required: false deprecated: false allowEmptyValue: true schema: type: string - name: scenario_id in: query description: Filter runs associated to Scenario given ID required: false deprecated: false allowEmptyValue: true schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. 
required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunListView' deprecated: false /v1/scenarios/runs/{id}: get: tags: - Scenario summary: Get a previously created ScenarioRun. description: Get a ScenarioRun given ID. operationId: getScenarioRun parameters: - name: id in: path description: The ScenarioRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunView' deprecated: false /v1/scenarios/runs/{id}/cancel: post: tags: - Scenario summary: Cancel a Scenario run. description: Cancel a currently running Scenario run. This will shutdown the underlying Devbox resource. operationId: cancelScenarioRun parameters: - name: id in: path description: The Scenario Run ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunView' deprecated: false /v1/scenarios/runs/{id}/complete: post: tags: - Scenario summary: Complete a ScenarioRun. description: Complete a currently running ScenarioRun. Calling complete will shutdown underlying Devbox resource. operationId: completeScenarioRun parameters: - name: id in: path description: The ScenarioRun ID. 
required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunView' deprecated: false /v1/scenarios/runs/{id}/download_logs: post: tags: - Scenario summary: Download logs for a Scenario run. description: Download a zip file containing all logs for a Scenario run from the associated devbox. operationId: downloadScenarioRunLogs parameters: - name: id in: path description: The ScenarioRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/zip: schema: type: string format: binary headers: Content-Type: description: application/zip required: true schema: type: string Content-Disposition: description: attachment; filename="scenario_run_logs.zip" required: true schema: type: string deprecated: false /v1/scenarios/runs/{id}/score: post: tags: - Scenario summary: Score a ScenarioRun. description: Score a currently running ScenarioRun. operationId: scoreScenarioRun parameters: - name: id in: path description: The ScenarioRun ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunView' deprecated: false /v1/scenarios/scorers: post: tags: - Scenario - ScenarioScorer summary: Create a custom scenario scorer. description: Create a custom scenario scorer. operationId: createCustomScorer parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/CreateCustomScorerParams' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioScorerView' '400': description: Custom scorer type already exists. deprecated: false get: tags: - Scenario - ScenarioScorer summary: List Scenario Scorers. description: List all Scenario Scorers matching filter.
operationId: listScenarioScorers parameters: - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioScorersListView' deprecated: false /v1/scenarios/scorers/{id}: post: tags: - Scenario - ScenarioScorer summary: Update a custom scenario scorer. description: Update a scenario scorer. operationId: updateCustomScorer parameters: - name: id in: path description: The Scorer ID. required: true deprecated: false allowEmptyValue: false schema: type: string requestBody: content: application/json: schema: $ref: '#/components/schemas/CreateCustomScorerParams' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioScorerView' deprecated: false get: tags: - Scenario - ScenarioScorer summary: Retrieve Scenario Scorer. description: Retrieve Scenario Scorer. operationId: retrieveScenarioScorers parameters: - name: id in: path description: The Scorer ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioScorerView' deprecated: false /v1/scenarios/start_run: post: tags: - Scenario summary: Start a new ScenarioRun. description: Start a new ScenarioRun based on the provided Scenario. 
operationId: startScenarioRun parameters: [] requestBody: content: application/json: schema: $ref: '#/components/schemas/StartScenarioRunParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunView' deprecated: false /v1/scenarios/{id}: post: tags: - Scenario summary: Update a Scenario. description: Update a Scenario. Fields that are null will preserve the existing value. Fields that are provided (including empty values) will replace the existing value entirely. operationId: updateScenario parameters: - name: id in: path description: The Scenario ID. required: true deprecated: false allowEmptyValue: false schema: type: string requestBody: content: application/json: schema: $ref: '#/components/schemas/ScenarioUpdateParameters' required: false responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionView' deprecated: false get: tags: - Scenario summary: Get a Scenario. description: Get a previously created scenario. operationId: getScenario parameters: - name: id in: path description: The Scenario ID. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionView' deprecated: false /v1/scenarios/{id}/archive: post: tags: - Scenario summary: Archive a Scenario. description: Archive a previously created Scenario. The scenario will no longer appear in list endpoints but can still be retrieved by ID. operationId: archiveScenario parameters: - name: id in: path description: The ID of the Scenario to archive. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionView' '403': description: Cannot archive public scenarios. '404': description: Scenario not found. 
deprecated: false /v1/scenarios/{id}/runs: get: tags: - Scenario summary: Get the runs for a Scenario. description: List all ScenarioRuns for the given Scenario. operationId: getScenarioRuns parameters: - name: id in: path description: The Scenario ID. required: true deprecated: false allowEmptyValue: false schema: type: string - name: limit in: query description: The limit of items to return. Default is 20. Max is 5000. required: false deprecated: false allowEmptyValue: true schema: type: integer format: int32 - name: starting_after in: query description: Load the next page of data starting after the item with the given ID. required: false deprecated: false allowEmptyValue: true schema: type: string - name: include_total_count in: query description: If true (default), includes total_count in the response. Set to false to skip the count query for better performance on large datasets. required: false deprecated: false allowEmptyValue: true schema: type: boolean responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioRunListView' deprecated: false /v1/scenarios/{id}/unarchive: post: tags: - Scenario summary: Unarchive a Scenario. description: Unarchive a previously archived Scenario. The scenario will appear in list endpoints again. operationId: unarchiveScenario parameters: - name: id in: path description: The ID of the Scenario to unarchive. required: true deprecated: false allowEmptyValue: false schema: type: string responses: '200': description: OK content: application/json: schema: $ref: '#/components/schemas/ScenarioDefinitionView' '403': description: Cannot unarchive public scenarios. '404': description: Scenario not found. deprecated: false components: schemas: AgentMount: type: object additionalProperties: false properties: agent_id: type: string nullable: true description: The ID of the agent to mount. Either agent_id or name must be set.
agent_name: type: string nullable: true description: The name of the agent to mount. Returns the most recent agent with a matching name if no agent id string provided. Either agent id or name must be set agent_path: type: string nullable: true description: Path to mount the agent on the Devbox. Required for git and object agents. Use absolute path (e.g., /home/user/agent) auth_token: type: string nullable: true description: Optional auth token for private repositories. Only used for git agents. type: type: string enum: - agent_mount default: agent_mount required: - agent_id - agent_name - type Architecture: type: string enum: - x86_64 - arm64 AstGrepScoringFunction: type: object additionalProperties: false description: AstGrepScoringFunction utilizes structured code search for scoring. properties: lang: type: string description: The language of the pattern. search_directory: type: string description: The path to search. pattern: type: string description: AST pattern to match. Pattern will be passed to ast-grep using the commandline surrounded by double quotes ("), so make sure to use proper escaping (for example, \$\$\$). type: type: string enum: - ast_grep_scorer default: ast_grep_scorer required: - search_directory - pattern - type BashScriptScoringFunction: type: object additionalProperties: false description: BashScriptScoringFunction is a scoring function specified by a bash script that will be run in the context of your environment. properties: bash_script: type: string description: A single bash script that sets up the environment, scores, and prints the final score to standard out. Score should be a float between 0.0 and 1.0, and look like "score=[0.0..1.0]". type: type: string enum: - bash_script_scorer default: bash_script_scorer required: - type BrokerMount: type: object additionalProperties: false properties: axon_id: type: string description: The ID of the axon event stream to mount onto the Devbox.
protocol: $ref: '#/components/schemas/BrokerMountProtocol' nullable: true description: The protocol used by the broker to deliver events to the agent. agent_binary: type: string nullable: true description: Binary to launch the agent (e.g., 'opencode'). Used by protocols that launch a subprocess (acp, claude_json). working_directory: type: string nullable: true description: Working directory in which to launch the agent binary. Defaults to the home directory if not specified. launch_args: type: array items: type: string nullable: true description: Arguments to pass to the agent command (e.g., ['acp']). Used by protocols that launch a subprocess (acp, claude_json). type: type: string enum: - broker_mount default: broker_mount required: - axon_id - type BrokerMountProtocol: type: string enum: - acp - claude_json BuiltInScoringFunction: oneOf: - $ref: '#/components/schemas/AstGrepScoringFunction' - $ref: '#/components/schemas/BashScriptScoringFunction' - $ref: '#/components/schemas/CommandScoringFunction' - $ref: '#/components/schemas/CustomScoringFunction' - $ref: '#/components/schemas/PythonScriptScoringFunction' - $ref: '#/components/schemas/TestBasedScoringFunction' discriminator: propertyName: type mapping: ast_grep_scorer: '#/components/schemas/AstGrepScoringFunction' bash_script_scorer: '#/components/schemas/BashScriptScoringFunction' command_scorer: '#/components/schemas/CommandScoringFunction' custom_scorer: '#/components/schemas/CustomScoringFunction' python_script_scorer: '#/components/schemas/PythonScriptScoringFunction' test_based_scorer: '#/components/schemas/TestBasedScoringFunction' CodeMount: type: object additionalProperties: false properties: repo_name: type: string description: The name of the repo to mount. By default, code will be mounted at /home/user/{repo_name}. repo_owner: type: string description: The owner of the repo. install_command: type: string nullable: true description: Installation command to install and setup repository. 
git_ref: type: string nullable: true description: Optional git ref (branch or tag) to checkout. Defaults to the repository default branch. token: type: string nullable: true description: The authentication token necessary to pull repo. type: type: string enum: - code_mount default: code_mount required: - repo_name - repo_owner - type CommandScoringFunction: type: object additionalProperties: false description: CommandScoringFunction executes a single command and checks the result. The output of the command will be printed. Scoring passes if the command returns status code 0; otherwise it fails. properties: command: type: string description: The command to execute. type: type: string enum: - command_scorer default: command_scorer required: - type CreateCustomScorerParams: type: object additionalProperties: false properties: type: type: string description: Name of the type of custom scorer. bash_script: type: string description: Bash script for the custom scorer taking context as a json object $RL_SCORER_CONTEXT. required: - type - bash_script CustomScoringFunction: type: object additionalProperties: false description: CustomScoringFunction is a custom, user defined scoring function. properties: custom_scorer_type: type: string description: Type of the scoring function, previously registered with Runloop. scorer_params: type: object nullable: true description: Additional JSON structured context to pass to the scoring function. type: type: string enum: - custom_scorer default: custom_scorer required: - custom_scorer_type - type FileMount: type: object additionalProperties: false properties: target: type: string description: Target path where the file should be mounted. content: type: string description: Content of the file to mount. type: type: string enum: - file_mount default: file_mount required: - target - content - type IdleAction: type: string enum: - shutdown - suspend description: 'Action to take after Devbox idle timer is triggered.
shutdown: Shutdown the Devbox. suspend: Suspend the Devbox. ' x-enum-descriptions: shutdown: Shutdown the Devbox. suspend: Suspend the Devbox. IdleConfigurationParameters: type: object additionalProperties: false properties: idle_time_seconds: type: integer format: int32 description: After idle_time_seconds, on_idle action will be taken. on_idle: $ref: '#/components/schemas/IdleAction' description: Action to take after Devbox becomes idle. required: - idle_time_seconds - on_idle InputContext: type: object additionalProperties: false description: InputContextView specifies the problem statement along with all additional context for a Scenario. properties: problem_statement: type: string description: The problem statement for the Scenario. additional_context: type: object nullable: true description: Additional JSON structured input context. required: - problem_statement InputContextUpdate: type: object additionalProperties: false properties: problem_statement: type: string nullable: true description: The problem statement for the Scenario. additional_context: type: object nullable: true description: Additional JSON structured input context. LaunchParameters: type: object additionalProperties: false description: LaunchParameters enable you to customize the resources available to your Devbox as well as the environment set up that should be completed before the Devbox is marked as 'running'. properties: launch_commands: type: array items: type: string nullable: true description: Set of commands to be run at launch time, before the entrypoint process is run. resource_size_request: $ref: '#/components/schemas/ResourceSize' nullable: true description: 'Preset Devbox resources (vCPU, RAM in GiB, ephemeral disk in GiB). If not set, SMALL is used. X_SMALL: 0.5 vCPU, 1 GiB RAM, 4 GiB disk. SMALL: 1 vCPU, 2 GiB RAM, 4 GiB disk. MEDIUM: 2 vCPU, 4 GiB RAM, 8 GiB disk. LARGE: 2 vCPU, 8 GiB RAM, 16 GiB disk. X_LARGE: 4 vCPU, 16 GiB RAM, 16 GiB disk. 
XX_LARGE: 8 vCPU, 32 GiB RAM, 16 GiB disk. CUSTOM_SIZE: set custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size.' available_ports: type: array items: type: integer format: int32 nullable: true description: '[Deprecated] A list of ports to make available on the Devbox. This field is ignored.' keep_alive_time_seconds: type: integer format: int64 nullable: true description: Time in seconds after which Devbox will automatically shutdown. Default is 1 hour. Maximum is 48 hours (172800 seconds). after_idle: $ref: '#/components/schemas/IdleConfigurationParameters' nullable: true description: Configure Devbox lifecycle based on idle activity. If after_idle is set, Devbox will ignore keep_alive_time_seconds. If both after_idle and lifecycle.after_idle are set, they must have the same value. Use lifecycle.after_idle instead. custom_cpu_cores: type: integer format: int32 nullable: true description: Custom CPU cores. Must be 0.5, 1, or a multiple of 2. Max is 16. custom_gb_memory: type: integer format: int32 nullable: true description: Custom memory size in GiB. Must be 1 or a multiple of 2. Max is 64GiB. custom_disk_size: type: integer format: int32 nullable: true description: Custom disk size in GiB. Must be a multiple of 2. Min is 2GiB, max is 64GiB. architecture: $ref: '#/components/schemas/Architecture' nullable: true description: The target architecture for the Devbox. If unset, defaults to x86_64. user_parameters: $ref: '#/components/schemas/UserParameters' nullable: true description: Specify the user for execution on Devbox. If not set, default `user` will be used. required_services: type: array items: type: string nullable: true description: A list of ContainerizedService names to be started when a Devbox is created. A valid ContainerizedService must be specified in Blueprint to be started. network_policy_id: type: string nullable: true description: (Optional) ID of the network policy to apply to Devboxes launched with these parameters. 
When set on a Blueprint launch parameters, Devboxes created from it will inherit this policy unless explicitly overridden. lifecycle: $ref: '#/components/schemas/LifecycleConfigurationParameters' nullable: true description: Lifecycle configuration for idle and resume behavior. Configure idle policy via lifecycle.after_idle (if both this and the top-level after_idle are set, they must match) and resume triggers via lifecycle.resume_triggers. LifecycleConfigurationParameters: type: object additionalProperties: false description: Lifecycle configuration for Devbox idle and resume behavior. Configure idle policy via after_idle and resume triggers via resume_triggers. properties: after_idle: $ref: '#/components/schemas/IdleConfigurationParameters' nullable: true description: Configure Devbox lifecycle based on idle activity. If both this and the top-level after_idle are set, they must have the same value. Prefer this field for new integrations. resume_triggers: $ref: '#/components/schemas/ResumeTriggers' nullable: true description: Triggers that can resume a suspended Devbox. MetadataKeysView: type: object additionalProperties: false properties: keys: type: array items: type: string MetadataValuesView: type: object additionalProperties: false properties: key: type: string values: type: array items: type: string Mount: oneOf: - $ref: '#/components/schemas/ObjectMount' - $ref: '#/components/schemas/AgentMount' - $ref: '#/components/schemas/CodeMount' - $ref: '#/components/schemas/FileMount' - $ref: '#/components/schemas/BrokerMount' discriminator: propertyName: type mapping: object_mount: '#/components/schemas/ObjectMount' agent_mount: '#/components/schemas/AgentMount' code_mount: '#/components/schemas/CodeMount' file_mount: '#/components/schemas/FileMount' broker_mount: '#/components/schemas/BrokerMount' ObjectMount: type: object additionalProperties: false properties: object_id: type: string description: The ID of the object to write. 
object_path: type: string description: The path to write the object on the Devbox. Use absolute path of object (ie /home/user/object.txt, or directory if archive /home/user/archive_dir) type: type: string enum: - object_mount default: object_mount required: - object_id - object_path - type PythonScriptScoringFunction: type: object additionalProperties: false description: PythonScriptScoringFunction will run a python script in the context of your environment as a ScoringFunction. properties: requirements_contents: type: string nullable: true description: Package dependencies to be installed. The requirements should be a valid requirements.txt file. python_script: type: string description: Python script to be run. The script should output the score to standard out as a float between 0.0 and 1.0. python_version_constraint: type: string nullable: true description: Python version to run scoring. Default is "==3.12.10" type: type: string enum: - python_script_scorer default: python_script_scorer required: - python_script - type ResourceSize: type: string enum: - X_SMALL - SMALL - MEDIUM - LARGE - X_LARGE - XX_LARGE - CUSTOM_SIZE description: 'The size of the Devbox resources for Runloop to allocate. X_SMALL: 0.5 cpu x 1GiB memory x 4GiB disk SMALL: 1 cpu x 2GiB memory x 4GiB disk MEDIUM: 2 cpu x 4GiB memory x 8GiB disk LARGE: 2 cpu x 8GiB memory x 16GiB disk X_LARGE: 4 cpu x 16GiB memory x 16GiB disk XX_LARGE: 8 cpu x 32GiB memory x 16GiB disk CUSTOM_SIZE: To choose a custom size, set this enum and also the custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size in launch parameters. CPU must be 0.5, 1, or a multiple of 2 (max 16). Memory must be 1 or a multiple of 2 (max 64GiB). Disk must be a multiple of 2 (min 2GiB, max 64GiB). The cpu:memory ratio must be between 1:2 and 1:8 inclusive. 
' x-enum-descriptions: X_SMALL: 0.5 cpu x 1GiB memory x 4GiB disk SMALL: 1 cpu x 2GiB memory x 4GiB disk MEDIUM: 2 cpu x 4GiB memory x 8GiB disk LARGE: 2 cpu x 8GiB memory x 16GiB disk X_LARGE: 4 cpu x 16GiB memory x 16GiB disk XX_LARGE: 8 cpu x 32GiB memory x 16GiB disk CUSTOM_SIZE: To choose a custom size, set this enum and also the custom_cpu_cores, custom_gb_memory, and optionally custom_disk_size in launch parameters. CPU must be 0.5, 1, or a multiple of 2 (max 16). Memory must be 1 or a multiple of 2 (max 64GiB). Disk must be a multiple of 2 (min 2GiB, max 64GiB). The cpu:memory ratio must be between 1:2 and 1:8 inclusive. ResumeTriggers: type: object additionalProperties: false description: Triggers that can resume a suspended Devbox. properties: http: type: boolean nullable: true description: When true, HTTP traffic to a suspended Devbox via tunnel will trigger a resume. axon_event: type: boolean nullable: true description: When true, axon events targeting a suspended Devbox will trigger a resume. RunProfile: type: object additionalProperties: false properties: purpose: type: string nullable: true description: Purpose of the run. envVars: type: object additionalProperties: type: string nullable: true description: 'Mapping of Environment Variable to Value. May be shown in devbox logging. Example: {"DB_PASS": "DATABASE_PASSWORD"} would set the environment variable ''DB_PASS'' to the value ''DATABASE_PASSWORD''.' secrets: type: object additionalProperties: type: string nullable: true description: 'Mapping of Environment Variable to User Secret Name. Never shown in devbox logging. Example: {"DB_PASS": "DATABASE_PASSWORD"} would set the environment variable ''DB_PASS'' to the value of the secret ''DATABASE_PASSWORD''.' launchParameters: $ref: '#/components/schemas/LaunchParameters' nullable: true description: Additional runtime LaunchParameters to apply after the devbox starts.
# Tail of RunProfile.properties; the RunProfile schema opens before this span.
        mounts:
          type: array
          items:
            $ref: '#/components/schemas/Mount'
          nullable: true
          description: A list of mounts to be included in the scenario run.

    # NOTE(review): several properties below place `nullable` / `description`
    # next to `$ref`. OpenAPI 3.0.x tooling may ignore keywords that sit
    # beside a `$ref`; if they must be honored, wrap the reference in `allOf`.
    # Left unchanged here - confirm against the generators that consume this.

    # Request body for creating a Scenario (used by createScenario).
    ScenarioCreateParameters:
      type: object
      additionalProperties: false
      properties:
        name:
          type: string
          description: Name of the scenario.
        input_context:
          $ref: '#/components/schemas/InputContext'
          description: The input context for the Scenario.
        scoring_contract:
          $ref: '#/components/schemas/ScoringContract'
          description: The scoring contract for the Scenario.
        environment_parameters:
          $ref: '#/components/schemas/ScenarioEnvironment'
          nullable: true
          description: The Environment in which the Scenario will run.
        metadata:
          type: object
          additionalProperties:
            type: string
          nullable: true
          description: >-
            User defined metadata to attach to the scenario for organization.
        reference_output:
          type: string
          nullable: true
          description: >-
            A string representation of the reference output to solve the
            scenario. Commonly can be the result of a git diff or a sequence
            of command actions to apply to the environment.
        required_environment_variables:
          type: array
          items:
            type: string
          nullable: true
          description: >-
            Environment variables required to run the scenario. If these
            variables are not provided, the scenario will fail to start.
        # NOTE(review): the text below speaks of a name-to-name mapping, but
        # the schema is a plain array of strings - confirm which is intended.
        required_secret_names:
          type: array
          items:
            type: string
          nullable: true
          description: >-
            Secrets required to run the scenario (user secret name to scenario
            required secret name). If these secrets are not provided or the
            mapping is incorrect, the scenario will fail to start.
        validation_type:
          $ref: '#/components/schemas/ValidationType'
          nullable: true
          description: Validation strategy.
        scorer_timeout_sec:
          type: integer
          format: int32
          nullable: true
          description: >-
            Timeout for scoring in seconds. Default 30 minutes (1800s).
      required:
        - name
        - input_context
        - scoring_contract

    # List response wrapping ScenarioDefinitionView items, with a `has_more`
    # flag and an optional `total_count` (returned by listScenarios and
    # listPublicScenarios).
    ScenarioDefinitionListView:
      type: object
      additionalProperties: false
      properties:
        scenarios:
          type: array
          items:
            $ref: '#/components/schemas/ScenarioDefinitionView'
          description: List of Scenarios matching filter.
        has_more:
          type: boolean
        total_count:
          type: integer
          format: int32
          nullable: true
      required:
        - scenarios
        - has_more

    # Status values a Scenario definition can carry.
    ScenarioDefinitionStatus:
      type: string
      enum:
        - active
        - archived

    # Read model of a Scenario (returned by createScenario and inside
    # ScenarioDefinitionListView).
    ScenarioDefinitionView:
      type: object
      additionalProperties: false
      description: >-
        A ScenarioDefinitionView represents a repeatable AI coding evaluation
        test, complete with initial environment and scoring contract.
      properties:
        id:
          type: string
          description: The ID of the Scenario.
        name:
          type: string
          description: The name of the Scenario.
        environment:
          $ref: '#/components/schemas/ScenarioEnvironment'
          nullable: true
          description: The Environment in which the Scenario is run.
        input_context:
          $ref: '#/components/schemas/InputContext'
          description: The input context for the Scenario.
        scoring_contract:
          $ref: '#/components/schemas/ScoringContract'
          description: The scoring contract for the Scenario.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: >-
            User defined metadata to attach to the scenario for organization.
        reference_output:
          type: string
          nullable: true
          description: >-
            A string representation of the reference output to solve the
            scenario. Commonly can be the result of a git diff or a sequence
            of command actions to apply to the environment.
        required_environment_variables:
          type: array
          items:
            type: string
          description: >-
            Environment variables required to run the scenario. If any
            required environment variables are missing, the scenario will
            fail to start.
        # Description corrected: it previously repeated the
        # "Environment variables required..." wording of the field above.
        required_secret_names:
          type: array
          items:
            type: string
          description: >-
            Secrets required to run the scenario. If any required secrets are
            missing, the scenario will fail to start.
        is_public:
          type: boolean
          description: Whether this scenario is public.
        validation_type:
          $ref: '#/components/schemas/ValidationType'
          nullable: true
          description: Validation strategy.
        scorer_timeout_sec:
          type: integer
          format: int32
          nullable: true
          description: >-
            Timeout for scoring in seconds. Default 30 minutes (1800s).
        status:
          $ref: '#/components/schemas/ScenarioDefinitionStatus'
          description: >-
            Whether the scenario is active or archived. Archived scenarios
            are excluded from listings and cannot be updated.
      required:
        - id
        - name
        - input_context
        - scoring_contract
        - metadata
        - status

    # Environment settings shared by the create, update and view schemas.
    # NOTE(review): the description names "ScenarioEnvironmentParameters"
    # while the schema is called ScenarioEnvironment - confirm the wording.
    ScenarioEnvironment:
      type: object
      additionalProperties: false
      description: >-
        ScenarioEnvironmentParameters specify the environment in which a
        Scenario will be run.
      properties:
        blueprint_id:
          type: string
          nullable: true
          description: Use the blueprint with matching ID.
        snapshot_id:
          type: string
          nullable: true
          description: Use the snapshot with matching ID.
        launch_parameters:
          $ref: '#/components/schemas/LaunchParameters'
          nullable: true
          description: >-
            Optional launch parameters to apply to the devbox environment at
            launch.
        working_directory:
          type: string
          nullable: true
          description: >-
            The working directory where the agent is expected to fulfill the
            scenario. Scoring functions also run from the working directory.

    # List response wrapping ScenarioRunView items, with a `has_more` flag
    # and an optional `total_count`.
    ScenarioRunListView:
      type: object
      additionalProperties: false
      properties:
        runs:
          type: array
          items:
            $ref: '#/components/schemas/ScenarioRunView'
          description: List of ScenarioRuns matching filter.
        has_more:
          type: boolean
        total_count:
          type: integer
          format: int32
          nullable: true
      required:
        - runs
        - has_more

    # States a ScenarioRun can report (see ScenarioRunView.state).
    ScenarioRunState:
      type: string
      enum:
        - running
        - scoring
        - scored
        - completed
        - canceled
        - timeout
        - failed

    # Read model of one run of a Scenario on a Devbox.
    ScenarioRunView:
      type: object
      additionalProperties: false
      description: >-
        A ScenarioRunView represents a single run of a Scenario on a Devbox.
        When completed, the ScenarioRun will contain the final score and
        output of the run.
      properties:
        id:
          type: string
          description: ID of the ScenarioRun.
        name:
          type: string
          nullable: true
          description: Optional name of ScenarioRun.
        scenario_id:
          type: string
          description: ID of the Scenario that has been run.
        devbox_id:
          type: string
          description: ID of the Devbox on which the Scenario is running.
        benchmark_run_id:
          type: string
          nullable: true
          description: >-
            ID of the BenchmarkRun that this Scenario is associated with, if
            any.
        scoring_contract_result:
          $ref: '#/components/schemas/ScoringContractResultView'
          nullable: true
          description: The scoring result of the ScenarioRun.
        start_time_ms:
          type: integer
          format: int64
          description: The time that the scenario started
        duration_ms:
          type: integer
          format: int64
          nullable: true
          description: Duration scenario took to run.
        state:
          $ref: '#/components/schemas/ScenarioRunState'
          description: The state of the ScenarioRun.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: >-
            User defined metadata to attach to the scenario run for
            organization.
        purpose:
          type: string
          nullable: true
          description: Purpose of the ScenarioRun.
        environment_variables:
          type: object
          additionalProperties:
            type: string
          nullable: true
          description: Environment variables used to run the scenario.
        secrets_provided:
          type: object
          additionalProperties:
            type: string
          nullable: true
          description: User secrets used to run the scenario.
      required:
        - id
        - scenario_id
        - devbox_id
        - state
        - metadata

    # Read model of a custom scorer: an id, a type name and a bash script.
    ScenarioScorerView:
      type: object
      additionalProperties: false
      description: >-
        A ScenarioScorerView represents a custom scoring function for a
        Scenario.
      properties:
        id:
          type: string
          description: ID for the scenario scorer.
        type:
          type: string
          description: Name of the type of scenario scorer.
        bash_script:
          type: string
          description: >-
            Bash script that takes in $RL_SCORER_CONTEXT as env variable and
            runs scoring.
      required:
        - id
        - type
        - bash_script

    # List response wrapping ScenarioScorerView items, with a `has_more` flag
    # and an optional `total_count`.
    ScenarioScorersListView:
      type: object
      additionalProperties: false
      properties:
        scorers:
          type: array
          items:
            $ref: '#/components/schemas/ScenarioScorerView'
          description: List of ScenarioScorers matching filter.
        has_more:
          type: boolean
        total_count:
          type: integer
          format: int32
          nullable: true
      required:
        - scorers
        - has_more

    # Partial-update body for a Scenario: no `required` list, every property
    # is nullable.
    ScenarioUpdateParameters:
      type: object
      additionalProperties: false
      description: >-
        ScenarioUpdateParameters contain the set of parameters to update a
        Scenario. All fields are optional - null fields preserve existing
        values, provided fields replace entirely.
      properties:
        name:
          type: string
          nullable: true
          description: Name of the scenario. Cannot be blank.
        input_context:
          $ref: '#/components/schemas/InputContextUpdate'
          nullable: true
          description: The input context for the Scenario.
        scoring_contract:
          $ref: '#/components/schemas/ScoringContractUpdate'
          nullable: true
          description: The scoring contract for the Scenario.
        environment_parameters:
          $ref: '#/components/schemas/ScenarioEnvironment'
          nullable: true
          description: The Environment in which the Scenario will run.
        metadata:
          type: object
          additionalProperties:
            type: string
          nullable: true
          description: >-
            User defined metadata to attach to the scenario. Pass in empty map
            to clear.
        required_environment_variables:
          type: array
          items:
            type: string
          nullable: true
          description: >-
            Environment variables required to run the scenario. Pass in empty
            list to clear.
        required_secret_names:
          type: array
          items:
            type: string
          nullable: true
          description: >-
            Secrets required to run the scenario. Pass in empty list to clear.
        reference_output:
          type: string
          nullable: true
          description: >-
            A string representation of the reference output to solve the
            scenario. Commonly can be the result of a git diff or a sequence
            of command actions to apply to the environment. Pass in empty
            string to clear.
        # NOTE(review): "empty string" is not one of the ValidationType enum
        # values (UNSPECIFIED, FORWARD, REVERSE, EVALUATION) - confirm how a
        # clear request is meant to be expressed.
        validation_type:
          $ref: '#/components/schemas/ValidationType'
          nullable: true
          description: Validation strategy. Pass in empty string to clear.
        scorer_timeout_sec:
          type: integer
          format: int32
          nullable: true
          description: >-
            Timeout for scoring in seconds. Default 30 minutes (1800s).
# components.schemas (continued): scoring, run-start, test-file and user
# schemas, followed by the security scheme and the document-level security.

    # NOTE(review): some properties below place `nullable` / `description`
    # next to `$ref`. OpenAPI 3.0.x tooling may ignore keywords that sit
    # beside a `$ref`; if they must be honored, wrap the reference in `allOf`.
    # Left unchanged here - confirm against the generators that consume this.

    # Scoring contract of a Scenario: the list of scoring functions to apply.
    # Description corrected: it previously carried the InputContextView text.
    ScoringContract:
      type: object
      additionalProperties: false
      description: >-
        ScoringContract specifies the scoring functions used to evaluate a
        Scenario.
      properties:
        scoring_function_parameters:
          type: array
          items:
            $ref: '#/components/schemas/ScoringFunction'
          description: >-
            A list of scoring functions used to evaluate the Scenario.
      required:
        - scoring_function_parameters

    # Aggregate scoring result: overall score plus per-function results.
    ScoringContractResultView:
      type: object
      additionalProperties: false
      description: >-
        A ScoringContractResultView represents the result of running all
        scoring functions on a given input context.
      properties:
        score:
          type: number
          format: float
          description: >-
            Total score for all scoring contracts. This will be a value
            between 0 and 1.
        scoring_function_results:
          type: array
          items:
            $ref: '#/components/schemas/ScoringFunctionResultView'
          description: List of all individual scoring function results.
      required:
        - score
        - scoring_function_results

    # Update variant of ScoringContract: same single field, but nullable and
    # not required.
    ScoringContractUpdate:
      type: object
      additionalProperties: false
      properties:
        scoring_function_parameters:
          type: array
          items:
            $ref: '#/components/schemas/ScoringFunction'
          nullable: true
          description: >-
            A list of scoring functions used to evaluate the Scenario.

    # One weighted scoring function inside a scoring contract.
    ScoringFunction:
      type: object
      additionalProperties: false
      description: ScoringFunction specifies a method of scoring a Scenario.
      properties:
        name:
          type: string
          description: >-
            Name of scoring function. Names must only contain [a-zA-Z0-9_-].
        scorer:
          $ref: '#/components/schemas/BuiltInScoringFunction'
          description: >-
            The scoring function to use for evaluating this scenario. The type
            field determines which built-in function to use.
        weight:
          type: number
          format: float
          description: >-
            Weight to apply to scoring function score. Weights of all scoring
            functions should sum to 1.0.
      required:
        - name
        - scorer
        - weight

    # Result of a single scoring function: score, name, log output, state.
    ScoringFunctionResultView:
      type: object
      additionalProperties: false
      description: >-
        A ScoringFunctionResultView represents the result of running a single
        scoring function on a given input context.
      properties:
        score:
          type: number
          format: float
          description: Final score for the given scoring function.
        scoring_function_name:
          type: string
          description: Scoring function name that ran.
        output:
          type: string
          description: Log output of the scoring function.
        state:
          $ref: '#/components/schemas/ScoringFunctionResultViewState'
          description: The state of the scoring function application.
      required:
        - score
        - scoring_function_name
        - output
        - state

    # States reported in ScoringFunctionResultView.state.
    ScoringFunctionResultViewState:
      type: string
      enum:
        - unknown
        - complete
        - error

    # Request body for starting a run of a Scenario; only `scenario_id` is
    # required.
    StartScenarioRunParameters:
      type: object
      additionalProperties: false
      properties:
        scenario_id:
          type: string
          description: ID of the Scenario to run.
        run_name:
          type: string
          nullable: true
          description: Display name of the run.
        benchmark_run_id:
          type: string
          nullable: true
          description: Benchmark to associate the run.
        metadata:
          type: object
          additionalProperties:
            type: string
          nullable: true
          description: >-
            User defined metadata to attach to the run for organization.
        # Description corrected: this schema starts a scenario run, but the
        # text previously said "benchmark run".
        runProfile:
          $ref: '#/components/schemas/RunProfile'
          nullable: true
          description: Runtime configuration to use for this scenario run
      required:
        - scenario_id

    # Scorer variant selected by `type: test_based_scorer`.
    TestBasedScoringFunction:
      type: object
      additionalProperties: false
      description: >-
        TestBasedScoringFunction writes test files to disk and executes a test
        command to verify the solution.
      properties:
        test_files:
          type: array
          items:
            $ref: '#/components/schemas/TestFile'
          description: List of test files to create
        test_command:
          type: string
          description: The command to execute for running the tests
        type:
          type: string
          enum:
            - test_based_scorer
          default: test_based_scorer
      required:
        - type

    # A file (path + contents) written for a TestBasedScoringFunction.
    TestFile:
      type: object
      additionalProperties: false
      properties:
        file_path:
          type: string
          description: >-
            Path to write content of the test file, relative to your
            environment's working directory
        file_contents:
          type: string
          description: Content of the test file

    # Linux user settings for a Devbox.
    # NOTE(review): `uid` is documented as non-negative but declares no
    # `minimum: 0` - confirm whether the constraint should be in the schema.
    UserParameters:
      type: object
      additionalProperties: false
      description: Configuration for the Linux user in the Devbox environment.
      properties:
        username:
          type: string
          description: Username for the Linux user.
        uid:
          type: integer
          format: int32
          description: >-
            User ID (UID) for the Linux user. Must be a non-negative integer.
      required:
        - username
        - uid

    # Validation strategies referenced by the Scenario schemas'
    # `validation_type` property.
    ValidationType:
      type: string
      enum:
        - UNSPECIFIED
        - FORWARD
        - REVERSE
        - EVALUATION

  # HTTP bearer-token authentication scheme.
  securitySchemes:
    bearerAuth:
      scheme: bearer
      type: http

# Document-level security requirement: bearerAuth, with no scopes.
security:
  - bearerAuth: []