#!/usr/bin/env bash set -euo pipefail usage() { cat <<'USAGE' Usage: scripts/validate-wannabuild-dry-runs.sh [repo_root] Validates the host-neutral daily-use trust harness: - Claude Code and Codex invocation contract - dry-run scenario coverage and fixture expectations - golden path demo artifacts - summary gate blocking behavior for failed review/QA evidence Exit code 0 = valid, non-zero = trust contract violation. USAGE } if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then usage exit 0 fi ROOT="${1:-$(cd "$(dirname "$0")/.." && pwd)}" ROOT="$(cd "$ROOT" && pwd)" DRY_RUN_DIR="$ROOT/skills/internal/build/dry-runs" MANIFEST="$DRY_RUN_DIR/daily-use-trust-scenarios.json" GOLDEN_ROOT="$ROOT/docs/golden-path-demo" ARTIFACT_VALIDATOR="$ROOT/scripts/validate-wannabuild-artifacts.sh" GATE_CHECK="$ROOT/scripts/wannabuild-gate-check.sh" status=0 pass() { printf 'PASS %s\n' "$1" } fail() { printf 'FAIL %s\n' "$1" status=1 } require_file() { local path="$1" if [[ -f "$path" ]]; then pass "${path#$ROOT/}" else fail "${path#$ROOT/}" fi } run_expect_success() { local label="$1" shift local output if output="$("$@" 2>&1)"; then pass "$label" else fail "$label" printf '%s\n' "$output" fi } run_expect_failure() { local label="$1" shift local output if output="$("$@" 2>&1)"; then fail "$label" printf 'Expected failure but command passed.\n' else pass "$label" fi } require_file "$MANIFEST" require_file "$ARTIFACT_VALIDATOR" require_file "$GATE_CHECK" if ! command -v python3 >/dev/null 2>&1; then fail "python3 available for dry-run validation" else pass "python3 available for dry-run validation" fi if [[ $status -eq 0 ]]; then if python3 - "$MANIFEST" "$DRY_RUN_DIR" <<'PY'; then import json from pathlib import Path import sys manifest_path = Path(sys.argv[1]) dry_run_dir = Path(sys.argv[2]) errors = [] def err(message): errors.append(message) def load_json(path): try: return json.loads(path.read_text(encoding="utf-8")) except Exception as exc: err(f"{path.name}: invalid JSON ({exc})") return None def get_value(payload, dotted): current = payload for part in dotted.split("."): if isinstance(current, dict) and part in current: current = current[part] else: return None return current manifest = load_json(manifest_path) if manifest is None: raise SystemExit(1) expected_hosts = { "claude_code": {"start": "/wannabuild", "intro": "/using-wannabuild"}, "codex": {"start": "$wannabuild", "intro": "$using-wannabuild"}, } if manifest.get("schema_version") != 1: err("schema_version must be 1") if manifest.get("host_invocations") != expected_hosts: err("host_invocations must preserve /wannabuild, /using-wannabuild, $wannabuild, and $using-wannabuild") required = { "no-task-invocation", "exploratory-discovery-invocation", "wb-discover-entrypoint-full-loop", "vague-acknowledgment-no-phase-skip", "runtime-plan-gate", "implementation-workspace-selection", "resume", "research-gate", "implementation-selection", "review-failure", "qa-failure", "summary-completion", } declared_required = set(manifest.get("required_scenarios", [])) if declared_required != required: err("required_scenarios must exactly match the daily-use trust scenario set") scenarios = manifest.get("scenarios") if not isinstance(scenarios, list): err("scenarios must be a list") scenarios = [] seen = set() for scenario in scenarios: if not isinstance(scenario, dict): err("scenario entries must be objects") continue sid = scenario.get("id") if not isinstance(sid, str) or not sid: err("scenario missing non-empty id") continue if sid in seen: err(f"duplicate scenario id: {sid}") seen.add(sid) if not scenario.get("description"): err(f"{sid}: missing description") fixture = scenario.get("fixture") payload = None if fixture: fixture_path = dry_run_dir / fixture if not fixture_path.is_file(): err(f"{sid}: missing fixture {fixture}") else: payload = load_json(fixture_path) for contract in [scenario, payload if isinstance(payload, dict) else None]: if isinstance(contract, dict) and "expected_banner" in contract: err(f"{sid}: must not require a visible machine banner") if sid == "no-task-invocation": no_task_contract = payload if isinstance(payload, dict) else scenario if no_task_contract.get("prompt_has_concrete_task") is not False: err("no-task-invocation must explicitly model a missing concrete task") if no_task_contract.get("next_prompt") != "Tell me what you want to build or change.": err("no-task-invocation next prompt changed") forbidden = set(no_task_contract.get("must_not", [])) for item in {"inspect_repo", "infer_git_diff", "plan", "implement"}: if item not in forbidden: err(f"no-task-invocation must forbid {item}") if sid == "exploratory-discovery-invocation": discovery_contract = payload if isinstance(payload, dict) else scenario if discovery_contract.get("prompt_has_concrete_task") is not True: err("exploratory-discovery-invocation must be concrete enough to start Discover") if discovery_contract.get("prompt_kind") != "exploratory_build_intent": err("exploratory-discovery-invocation prompt_kind must be exploratory_build_intent") if discovery_contract.get("expected_skill") != "wannabuild": err("exploratory-discovery-invocation must use the full wannabuild skill") if discovery_contract.get("expected_loop") != "full": err("exploratory-discovery-invocation must continue through the full loop after Discover") if discovery_contract.get("expected_public_step") != "discover": err("exploratory-discovery-invocation must route to Discover") if discovery_contract.get("expected_next_action") != "start_discovery_interview": err("exploratory-discovery-invocation must start the discovery interview") forbidden = set(discovery_contract.get("must_not", [])) for item in {"no_task_fallback", "ask_user_to_invoke_skill", "plan_before_discovery", "implement_before_plan"}: if item not in forbidden: err(f"exploratory-discovery-invocation must forbid {item}") examples = discovery_contract.get("prompt_examples", []) if not isinstance(examples, list) or not examples: err("exploratory-discovery-invocation must include prompt_examples") else: joined = " ".join(str(item).lower() for item in examples) for phrase in ["work on this", "brainstorm", "what should we add"]: if phrase not in joined: err(f"exploratory-discovery-invocation examples must cover {phrase!r}") if sid == "wb-discover-entrypoint-full-loop": entry_contract = payload if isinstance(payload, dict) else scenario if entry_contract.get("prompt_has_concrete_task") is not True: err("wb-discover-entrypoint-full-loop must model a concrete change request") if entry_contract.get("invoked_skill") != "wb-discover": err("wb-discover-entrypoint-full-loop must model wb-discover as the invoked entrypoint") if entry_contract.get("expected_loop") != "full": err("wb-discover-entrypoint-full-loop must continue the full loop") if entry_contract.get("expected_public_step") != "discover": err("wb-discover-entrypoint-full-loop must start at Discover") if entry_contract.get("expected_next_action") != "produce_requirements_then_plan": err("wb-discover-entrypoint-full-loop must transition from requirements to planning") progression = entry_contract.get("expected_phase_progression", []) for stage in ["discover", "plan", "implement", "review", "qa", "ship", "summary"]: if stage not in progression: err(f"wb-discover-entrypoint-full-loop must include {stage!r} in expected_phase_progression") forbidden = set(entry_contract.get("must_not", [])) for item in {"stop_after_discovery", "ask_user_to_invoke_wb_plan", "ask_user_to_invoke_wb_build", "generic_codex_implementation", "implement_before_plan"}: if item not in forbidden: err(f"wb-discover-entrypoint-full-loop must forbid {item}") if sid == "vague-acknowledgment-no-phase-skip": ack_contract = payload if isinstance(payload, dict) else scenario if ack_contract.get("active_workflow") is not True: err("vague-acknowledgment-no-phase-skip must model an active workflow") if ack_contract.get("workflow_status") != "in_progress": err("vague-acknowledgment-no-phase-skip must keep workflow_status in_progress") if ack_contract.get("expected_next_action") != "continue_current_phase": err("vague-acknowledgment-no-phase-skip must continue the current phase") if ack_contract.get("phase_limit_requires_explicit_user_text") is not True: err("vague-acknowledgment-no-phase-skip must require explicit text for phase limits") examples = ack_contract.get("prompt_examples", []) joined = " ".join(str(item).lower() for item in examples) for phrase in ["ok", "uh ok"]: if phrase not in joined: err(f"vague-acknowledgment-no-phase-skip examples must cover {phrase!r}") forbidden = set(ack_contract.get("must_not", [])) for item in {"treat_as_approval_to_skip_plan", "start_implementation_without_plan", "stop_workflow", "ask_user_to_invoke_next_phase"}: if item not in forbidden: err(f"vague-acknowledgment-no-phase-skip must forbid {item}") if sid == "runtime-plan-gate": gate_contract = payload if isinstance(payload, dict) else scenario if gate_contract.get("active_workflow") is not True: err("runtime-plan-gate must model an active workflow") if gate_contract.get("workflow_status") != "in_progress": err("runtime-plan-gate must keep workflow_status in_progress") if gate_contract.get("public_stage") != "plan": err("runtime-plan-gate must model the plan stage") if gate_contract.get("expected_runtime_context") is not True: err("runtime-plan-gate must require runtime context injection") if gate_contract.get("expected_next_action") != "complete_plan_before_implementation": err("runtime-plan-gate must require Plan completion before implementation") if gate_contract.get("required_gate_command") != "scripts/wannabuild-session.sh assert-plan-ready ": err("runtime-plan-gate must require the assert-plan-ready gate command") forbidden = set(gate_contract.get("must_not", [])) for item in {"implement_before_plan", "edit_code_before_plan_gate", "treat_vague_ack_as_approval", "generic_codex_implementation"}: if item not in forbidden: err(f"runtime-plan-gate must forbid {item}") workspace_fixture = scenario.get("workspace_fixture") if workspace_fixture: workspace_path = dry_run_dir / workspace_fixture workspace = load_json(workspace_path) if workspace_path.is_file() else None if workspace is None: err(f"{sid}: missing workspace fixture {workspace_fixture}") else: branch_name = workspace.get("branch_name", "") prefixes = tuple(scenario.get("branch_prefixes", [])) if not prefixes: err(f"{sid}: workspace scenario must declare accepted branch_prefixes") elif not isinstance(branch_name, str) or not branch_name.startswith(prefixes): err(f"{sid}: workspace branch {branch_name!r} does not use an accepted host branch prefix") for key in ["workspace_id", "source_repo", "source_branch", "workspace_path", "branch_name", "dirty_snapshot"]: if key not in workspace: err(f"{sid}: workspace fixture missing {key}") source_repo = workspace.get("source_repo") isolated_path = workspace.get("workspace_path") if not isinstance(source_repo, str) or not source_repo: err(f"{sid}: workspace source_repo must be a non-empty string") if not isinstance(isolated_path, str) or not isolated_path: err(f"{sid}: workspace workspace_path must be a non-empty string") elif "/.codex/worktrees/" not in isolated_path: err(f"{sid}: workspace_path must live under a .codex/worktrees root") if isinstance(source_repo, str) and isinstance(isolated_path, str) and source_repo == isolated_path: err(f"{sid}: workspace_path must differ from source_repo") if not isinstance(workspace.get("dirty_snapshot"), bool): err(f"{sid}: workspace dirty_snapshot must be boolean") if payload is not None: for key, expected in scenario.get("expect", {}).items(): actual = get_value(payload, key) if actual != expected: err(f"{sid}: expected {key}={expected!r}, got {actual!r}") history = scenario.get("required_public_history", []) if history: records = payload.get("public_stage_history", []) stages = {item.get("stage") for item in records if isinstance(item, dict)} for stage in history: if stage not in stages: err(f"{sid}: missing public_stage_history stage {stage!r}") public_stage_expect = scenario.get("latest_public_stage_expect", {}) if public_stage_expect: records = payload.get("public_stage_history", []) if not isinstance(records, list) or not records: err(f"{sid}: missing public_stage_history records") else: stage = public_stage_expect.get("stage") candidates = [item for item in records if isinstance(item, dict) and (not stage or item.get("stage") == stage)] if not candidates: err(f"{sid}: missing public_stage_history record for {stage!r}") else: latest_record = candidates[-1] for key, expected in public_stage_expect.items(): actual = latest_record.get(key) if actual != expected: err(f"{sid}: expected latest public stage {key}={expected!r}, got {actual!r}") latest_expect = scenario.get("latest_iteration_expect", {}) if latest_expect: iterations = payload.get("iterations") if not isinstance(iterations, list) or not iterations: err(f"{sid}: missing loop iterations") else: latest = max(iterations, key=lambda item: item.get("iteration", 0)) for key, expected in latest_expect.items(): actual = get_value(latest, key) if actual != expected: err(f"{sid}: expected latest iteration {key}={expected!r}, got {actual!r}") coverage = seen & required missing = sorted(required - coverage) if missing: err("missing required scenario coverage: " + ", ".join(missing)) if errors: print("Daily-use trust dry-run validation failed:") for message in errors: print(f" - {message}") raise SystemExit(1) print(f"Validated {len(scenarios)} daily-use trust scenario(s).") PY pass "dry-run scenario manifest" else fail "dry-run scenario manifest" fi fi if [[ -d "$GOLDEN_ROOT" ]]; then run_expect_success "golden path artifact validation" "$ARTIFACT_VALIDATOR" "$GOLDEN_ROOT" document # The committed demo carries fixture verdicts, not machine-local signed # evidence, so its gate check runs in explicit (and loudly labeled) fixture mode. run_expect_success "golden path summary gate (fixture evidence mode)" \ env WB_EVIDENCE_MODE=fixture "$GATE_CHECK" "$GOLDEN_ROOT" summary else fail "docs/golden-path-demo/" fi if [[ $status -eq 0 ]]; then tmp="$(mktemp -d)" trap 'rm -rf "$tmp"' EXIT cp -R "$GOLDEN_ROOT/.wannabuild" "$tmp/.wannabuild" rm -f "$tmp/.wannabuild/outputs/qa-summary.md" run_expect_failure "summary gate blocks missing QA summary" "$GATE_CHECK" "$tmp" summary rm -rf "$tmp/.wannabuild" mkdir -p "$tmp/.wannabuild/review" "$tmp/.wannabuild/outputs" cp "$DRY_RUN_DIR/review-failure-loop.json" "$tmp/.wannabuild/loop-state.json" printf 'QA completed, but review remains blocked.\n' >"$tmp/.wannabuild/outputs/qa-summary.md" python3 - "$tmp/.wannabuild/loop-state.json" "$tmp/.wannabuild/review" <<'PY' import json from pathlib import Path import sys loop = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8")) review_dir = Path(sys.argv[2]) latest = max(loop["iterations"], key=lambda item: item.get("iteration", 0)) for reviewer, verdict in latest["verdicts"].items(): path = review_dir / f"{reviewer}-iter-{latest['iteration']}.json" path.write_text(json.dumps(verdict, indent=2) + "\n", encoding="utf-8") PY run_expect_failure "summary gate blocks failed reviewer verdict" "$GATE_CHECK" "$tmp" summary rm -rf "$tmp/.wannabuild" mkdir -p "$tmp/.wannabuild/review" "$tmp/.wannabuild/outputs" cp "$DRY_RUN_DIR/qa-failure-remediation-loop.json" "$tmp/.wannabuild/loop-state.json" printf 'QA attempted; integration hard gate remains failing.\n' >"$tmp/.wannabuild/outputs/qa-summary.md" python3 - "$tmp/.wannabuild/loop-state.json" "$tmp/.wannabuild/review/wb-integration-tester-iter-2.json" <<'PY' import json from pathlib import Path import sys loop = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8")) latest = max(loop["iterations"], key=lambda item: item.get("iteration", 0)) verdict = latest["verdicts"]["wb-integration-tester"] Path(sys.argv[2]).write_text(json.dumps(verdict, indent=2) + "\n", encoding="utf-8") PY run_expect_failure "summary gate blocks failed integration verdict" "$GATE_CHECK" "$tmp" summary # Red-team scenario: a complete, plausible set of hand-written PASS verdicts # (the golden-path fixtures themselves) must NOT pass outside fixture mode, # because no runtime-recorded execution evidence exists for them. rm -rf "$tmp/.wannabuild" cp -R "$GOLDEN_ROOT/.wannabuild" "$tmp/.wannabuild" run_expect_failure "summary gate blocks forged verdicts without runtime evidence" \ env -u WB_EVIDENCE_MODE "$GATE_CHECK" "$tmp" summary fi if [[ $status -eq 0 ]]; then echo "Daily-use trust dry runs passed." else echo "Daily-use trust dry runs failed." fi exit "$status"