---
# Arazzo workflow document describing how to create and run a Runloop scenario.
# Version-like scalars are quoted so they are always read as strings.
arazzo: "1.0.1"
info:
  title: Runloop Create Scenario and Run It
  version: "1.0.0"
  summary: >-
    Define a repeatable AI coding evaluation scenario, start a run of it, poll
    until it is scored, then complete the run.
  description: >-
    Scenarios are repeatable AI coding evaluations that pair a starting
    environment with a scoring contract.
    This workflow creates a scenario with a problem statement and a bash script
    scorer, starts a scenario run on a fresh devbox, polls the run until it
    reaches the scored state (looping while it is running or scoring, and
    ending the flow if it fails, times out, or is canceled), then completes the
    run to release the underlying devbox.
    Every step spells out its request inline so the flow can be read and
    executed without opening the underlying OpenAPI description.
# The only source description in this document; the url is a relative path to
# an OpenAPI file.
sourceDescriptions:
- name: scenarioApi
  type: openapi
  url: ../openapi/runloop-scenario-api-openapi.yml
workflows:
- workflowId: create-scenario-and-run
  summary: Create a scenario, run it, wait for scoring, and complete the run.
  description: >-
    Creates a scenario with a scoring contract, starts a run, polls until the
    run is scored, then completes the run.
  # Caller-supplied inputs, expressed as a JSON Schema object. Every declared
  # property is listed under `required`.
  inputs:
    type: object
    properties:
      # Sent as the bearer credential in each step's Authorization header.
      apiToken:
        description: Runloop API bearer token.
        type: string
      # Used as the scenario name and also as the run name in startRun.
      scenarioName:
        description: Name to give the scenario.
        type: string
      problemStatement:
        description: The problem statement the scenario asks the agent to solve.
        type: string
      scorerName:
        description: Name of the scoring function (only [a-zA-Z0-9_-] allowed).
        type: string
      bashScript:
        description: >-
          Bash script that scores the run and prints score=[0.0..1.0] to
          stdout.
        type: string
    required:
    - apiToken
    - scenarioName
    - problemStatement
    - scorerName
    - bashScript
  steps:
  # Step 1: create the scenario definition (problem statement plus a single
  # bash-script scorer) and capture its id for the run step.
  - stepId: createScenario
    description: >-
      Create a scenario with the supplied problem statement and a single bash
      script scorer weighted at 1.0.
    operationId: createScenario
    parameters:
    # A runtime expression embedded inside a longer string must be wrapped in
    # curly braces to be evaluated; without them the header value is the
    # literal text "Bearer $inputs.apiToken".
    - name: Authorization
      in: header
      value: 'Bearer {$inputs.apiToken}'
    requestBody:
      contentType: application/json
      payload:
        name: $inputs.scenarioName
        input_context:
          problem_statement: $inputs.problemStatement
        scoring_contract:
          scoring_function_parameters:
          # One scorer only, so it carries the full weight.
          - name: $inputs.scorerName
            weight: 1.0
            scorer:
              type: bash_script_scorer
              bash_script: $inputs.bashScript
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # Consumed by startRun as the scenario_id of the run to start.
      scenarioId: $response.body#/id
  # Step 2: start a run of the scenario created by createScenario and capture
  # the run id that the polling and completion steps address.
  - stepId: startRun
    description: Start a scenario run, which provisions a devbox for the evaluation.
    operationId: startScenarioRun
    parameters:
    # A runtime expression embedded inside a longer string must be wrapped in
    # curly braces to be evaluated; without them the header value is the
    # literal text "Bearer $inputs.apiToken".
    - name: Authorization
      in: header
      value: 'Bearer {$inputs.apiToken}'
    requestBody:
      contentType: application/json
      payload:
        scenario_id: $steps.createScenario.outputs.scenarioId
        # The run reuses the scenario name as its own name.
        run_name: $inputs.scenarioName
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # Path parameter for the pollRun and completeRun steps.
      runId: $response.body#/id
      devboxId: $response.body#/devbox_id
      state: $response.body#/state
  # Step 3: fetch the run and branch on its reported state. The step jumps to
  # completeRun once scored, jumps back to itself while the run is in progress,
  # and ends the workflow on a failed, timed-out, or canceled run.
  - stepId: pollRun
    description: >-
      Poll the scenario run until it reaches the scored state, looping while it
      is running or scoring, and ending the flow on failure, timeout, or cancel.
    operationId: getScenarioRun
    parameters:
    # A runtime expression embedded inside a longer string must be wrapped in
    # curly braces to be evaluated; without them the header value is the
    # literal text "Bearer $inputs.apiToken".
    - name: Authorization
      in: header
      value: 'Bearer {$inputs.apiToken}'
    - name: id
      in: path
      value: $steps.startRun.outputs.runId
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      state: $response.body#/state
      score: $response.body#/scoring_contract_result/score
    # The criteria below are simple conditions over the response body's state
    # field, with string literals in single quotes. A `jsonpath` criterion must
    # be a JSONPath query, and a bare `$.state == "..."` comparison is not one.
    # NOTE(review): success actions have no delay field, so the inProgress goto
    # re-polls immediately - confirm the runner throttles or bounds this loop.
    onSuccess:
    - name: scored
      type: goto
      stepId: completeRun
      criteria:
      - condition: $response.body#/state == 'scored'
    - name: inProgress
      type: goto
      stepId: pollRun
      criteria:
      - condition: >-
          $response.body#/state == 'running' ||
          $response.body#/state == 'scoring'
    - name: runEnded
      type: end
      criteria:
      - condition: >-
          $response.body#/state == 'failed' ||
          $response.body#/state == 'timeout' ||
          $response.body#/state == 'canceled'
  # Step 4: complete the run started by startRun and capture the final state
  # and score that the workflow returns to its caller.
  - stepId: completeRun
    description: Complete the scored run, shutting down the underlying devbox.
    operationId: completeScenarioRun
    parameters:
    # A runtime expression embedded inside a longer string must be wrapped in
    # curly braces to be evaluated; without them the header value is the
    # literal text "Bearer $inputs.apiToken".
    - name: Authorization
      in: header
      value: 'Bearer {$inputs.apiToken}'
    - name: id
      in: path
      value: $steps.startRun.outputs.runId
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      finalState: $response.body#/state
      finalScore: $response.body#/scoring_contract_result/score
  # Values returned to the caller of the workflow.
  outputs:
    scenarioId: $steps.createScenario.outputs.scenarioId
    runId: $steps.startRun.outputs.runId
    # finalState and finalScore come from completeRun, which is skipped when
    # pollRun ends the workflow through its runEnded action.
    finalState: $steps.completeRun.outputs.finalState
    finalScore: $steps.completeRun.outputs.finalScore
    # State seen by the most recent poll. It is available even when the
    # workflow ended early, so the caller can tell failed, timeout, and
    # canceled runs apart.
    lastPolledState: $steps.pollRun.outputs.state