#!/usr/bin/env bash # swarm/supervise.sh — resilience wrapper around agent.sh (ADR-017, # SPEC-017-A). # # The agent loop is deliberately fail-fast: it exits 3 on an infrastructure # failure (ADR-016), and exits 0 on an empty pool even when the tree is still # in flight (open PRs awaiting gates, blocked parents awaiting the unblock # sweep). This wrapper owns "keep the tree moving", so an outage costs # wall-clock instead of maintainer attention: # # agent exit 3 (infra) → exponential backoff (base 300s, ×2 per # consecutive infra failure, cap 3600s), retry # agent exit 1 (cycle fail) → short fixed backoff (120s), retry # agent exit 0, scope open → in-flight wait (180s): PRs merge, sweeps # unblock parents, then re-run # agent exit 0, scope closed → done — every goal in the scope is proved # agent exit 2 (config) → fatal: a human must fix the environment # # Every wait also runs scope-limited PR hygiene (best-effort, never fatal): # - duplicate open prove PRs on one goal (the claim-race symptom, #184/#185) # → close all but the oldest, with a comment; # - an open PR sitting CONFLICTING → loud log line. GitHub runs no checks on # a conflicted PR, so an armed auto-merge waits forever in silence (the # #166 failure mode) — surfacing it is the fix's first half; the resolve # stays with the maintainer (records can need semantic resolution). # # Usage: # ./swarm/supervise.sh --prove --goal [other agent.sh args] # ./swarm/supervise.sh --self-test # # Environment (beyond agent.sh's own): # UNSORRY_SUP_MAX_RUNS max agent invocations before giving up (default 50) # UNSORRY_SUP_BASE_BACKOFF infra backoff base seconds (default 300) # UNSORRY_SUP_MAX_BACKOFF infra backoff cap seconds (default 3600) # UNSORRY_SUP_SHORT_WAIT cycle-failure backoff seconds (default 120) # UNSORRY_SUP_FLIGHT_WAIT in-flight wait seconds (default 180) # # Exit codes: 0 scope closed (or unscoped pool empty) · 1 run budget exhausted # · 2 configuration error (own or agent's) · 130 interrupted. set -euo pipefail log() { printf '[supervise.sh] %s\n' "$*" >&2; } # ---------------------------------------------------------- pure decisions # next_action # Prints " ": done-check 0 | fatal 0 | backoff N | retry N. # Pure — the supervisor's whole policy lives here so it is testable. next_action() { local rc="$1" consec="$2" base="$3" cap="$4" short="$5" delay shift_n case "$rc" in 0) echo "done-check 0" ;; 2) echo "fatal 0" ;; 3) shift_n="$consec"; [ "$shift_n" -gt 4 ] && shift_n=4 delay=$(( base * (1 << shift_n) )) [ "$delay" -gt "$cap" ] && delay="$cap" echo "backoff $delay" ;; *) echo "retry $short" ;; esac } # scope_closed # 0 iff the goal and every descendant (-s*) is status≜proved. An empty # scope id never closes (unscoped runs end on the agent's own exit 0). scope_closed() { local dir="$1" goal="$2" f [ -n "$goal" ] || return 1 [ -e "$dir/$goal.aisp" ] || return 1 for f in "$dir/$goal.aisp" "$dir/$goal"-s*.aisp; do [ -e "$f" ] || continue grep -q 'status≜proved' "$f" || return 1 done return 0 } # duplicate_prs — stdin: "\t\t" for OPEN prove PRs; # stdout: the numbers to close — every PR that is not the OLDEST for its goal # (key = title up to the first ':', i.e. "prove(<goal>)"). Pure. duplicate_prs() { awk -F'\t' '{ split($3, t, ":"); key = t[1] if (!(key in best)) { best[key] = $1; created[key] = $2; next } if ($2 < created[key]) { print best[key]; best[key] = $1; created[key] = $2 } else { print $1 } }' } # ------------------------------------------------------------- gh plumbing # Scope-limited PR hygiene. Best-effort: any gh failure is logged and ignored # — hygiene must never take the supervisor down. sweep_pr_hygiene() { local goal="$1" rows n rows="$(gh pr list --state open --limit 50 --search "\"$goal\" in:title" \ --json number,createdAt,title \ --jq '.[] | select(.title|startswith("prove(")) | [.number, .createdAt, .title] | @tsv' \ 2>/dev/null)" || { log "pr hygiene: gh unavailable — skipped"; return 0; } [ -n "$rows" ] || return 0 while IFS= read -r n; do [ -n "$n" ] || continue log "closing duplicate prove PR #$n (older sibling kept)" gh pr close "$n" --comment \ "Duplicate prove PR for an already-claimed goal (claim-race window); closed by supervise.sh keeping the oldest open PR (ADR-017)." \ >/dev/null 2>&1 || log "could not close #$n — leaving it" done < <(printf '%s\n' "$rows" | duplicate_prs) # Conflicted PRs get no CI runs and an armed auto-merge never fires — shout. gh pr list --state open --limit 50 --search "\"$goal\" in:title" \ --json number,mergeable,title \ --jq '.[] | select(.mergeable=="CONFLICTING") | "#\(.number) \(.title)"' \ 2>/dev/null | while IFS= read -r n; do [ -n "$n" ] || continue log "ATTENTION: $n is CONFLICTING — no checks will run and auto-merge will never fire; maintainer resolution needed" done return 0 } # ---------------------------------------------------------------- self-test test_next_action() { local got got="$(next_action 0 0 300 3600 120)" [ "$got" = "done-check 0" ] || { log " rc0: want 'done-check 0', got '$got'"; return 1; } got="$(next_action 2 0 300 3600 120)" [ "$got" = "fatal 0" ] || { log " rc2: want 'fatal 0', got '$got'"; return 1; } got="$(next_action 1 0 300 3600 120)" [ "$got" = "retry 120" ] || { log " rc1: want 'retry 120', got '$got'"; return 1; } # Infra backoff doubles per consecutive failure and caps. got="$(next_action 3 0 300 3600 120)" [ "$got" = "backoff 300" ] || { log " infra0: want 'backoff 300', got '$got'"; return 1; } got="$(next_action 3 1 300 3600 120)" [ "$got" = "backoff 600" ] || { log " infra1: want 'backoff 600', got '$got'"; return 1; } got="$(next_action 3 3 300 3600 120)" [ "$got" = "backoff 2400" ] || { log " infra3: want 'backoff 2400', got '$got'"; return 1; } got="$(next_action 3 9 300 3600 120)" [ "$got" = "backoff 3600" ] || { log " infra9: want capped 'backoff 3600', got '$got'"; return 1; } } test_scope_closed() { local d; d="$(mktemp -d)" || return 1 printf 'status≜proved\n' > "$d/g.aisp" printf 'status≜proved\n' > "$d/g-s1.aisp" printf 'status≜open\n' > "$d/g-s2.aisp" scope_closed "$d" g && { log " open sub counted as closed"; rm -rf "$d"; return 1; } printf 'status≜proved\n' > "$d/g-s2.aisp" scope_closed "$d" g || { log " fully proved tree not closed"; rm -rf "$d"; return 1; } # Blocked parents and deep descendants hold the scope open. printf 'status≜blocked\n' > "$d/g-s2-s1.aisp" scope_closed "$d" g && { log " blocked grandchild counted as closed"; rm -rf "$d"; return 1; } # An empty scope id, or a missing root record, never closes. scope_closed "$d" "" && { log " empty scope closed"; rm -rf "$d"; return 1; } scope_closed "$d" missing && { log " missing root closed"; rm -rf "$d"; return 1; } rm -rf "$d" return 0 } test_duplicate_prs() { local got want got="$(printf '184\t2026-06-11T19:01:00Z\tprove(g-a): thm by alpha\n185\t2026-06-11T19:02:00Z\tprove(g-a): thm by bravo\n190\t2026-06-11T19:03:00Z\tprove(g-b): other by alpha\n' | duplicate_prs)" want="185" [ "$got" = "$want" ] || { log " dedupe: want '$want', got '$got'"; return 1; } # Order-independent: the oldest survives even when listed last. got="$(printf '185\t2026-06-11T19:02:00Z\tprove(g-a): thm by bravo\n184\t2026-06-11T19:01:00Z\tprove(g-a): thm by alpha\n' | duplicate_prs)" [ "$got" = "185" ] || { log " dedupe order: want '185', got '$got'"; return 1; } # No duplicates → nothing to close. got="$(printf '184\t2026-06-11T19:01:00Z\tprove(g-a): thm by alpha\n' | duplicate_prs)" [ -z "$got" ] || { log " singleton: want '', got '$got'"; return 1; } } run_self_test() { local tests=(test_next_action test_scope_closed test_duplicate_prs) local failures=0 t for t in "${tests[@]}"; do if "$t"; then printf 'PASS %s\n' "$t"; else printf 'FAIL %s\n' "$t"; failures=$((failures + 1)); fi done if [ "$failures" -gt 0 ]; then printf 'supervise self-test: %d of %d failed\n' "$failures" "${#tests[@]}" exit 1 fi printf 'supervise self-test: all %d tests passed\n' "${#tests[@]}" exit 0 } # --------------------------------------------------------------------- main main() { if [ "${1:-}" = "--self-test" ]; then run_self_test fi [ -f swarm/agent.sh ] || { log "run from the repository root"; exit 2; } local max_runs="${UNSORRY_SUP_MAX_RUNS:-50}" local base="${UNSORRY_SUP_BASE_BACKOFF:-300}" local cap="${UNSORRY_SUP_MAX_BACKOFF:-3600}" local short="${UNSORRY_SUP_SHORT_WAIT:-120}" local flight="${UNSORRY_SUP_FLIGHT_WAIT:-180}" local v for v in "$max_runs" "$base" "$cap" "$short" "$flight"; do [[ "$v" =~ ^[0-9]+$ ]] || { log "non-integer supervisor knob '$v'"; exit 2; } done # The --goal scope (if any) drives scope_closed; agent.sh re-validates it. local goal="" prev="" for v in "$@"; do [ "$prev" = "--goal" ] && goal="$v" prev="$v" done local runs=0 consec_infra=0 rc action verb delay while :; do runs=$((runs + 1)) if [ "$runs" -gt "$max_runs" ]; then log "run budget exhausted ($max_runs agent runs) — giving up" exit 1 fi git pull -q --ff-only 2>/dev/null || log "git pull failed — running on the current tree" if [ -n "$goal" ] && scope_closed goals "$goal"; then log "scope $goal is fully proved — done after $((runs - 1)) agent run(s)" exit 0 fi rc=0 ./swarm/agent.sh "$@" || rc=$? action="$(next_action "$rc" "$consec_infra" "$base" "$cap" "$short")" verb="${action% *}"; delay="${action#* }" case "$verb" in done-check) consec_infra=0 git pull -q --ff-only 2>/dev/null || true if [ -z "$goal" ]; then log "agent found nothing to do and no scope is set — done" exit 0 fi if scope_closed goals "$goal"; then log "scope $goal is fully proved — done after $runs agent run(s)" exit 0 fi log "pool empty but scope $goal still open — waiting ${flight}s for PRs/sweeps (run $runs/$max_runs)" sweep_pr_hygiene "$goal" sleep "$flight" ;; fatal) log "agent exited 2 (configuration) — a human must fix this" exit 2 ;; backoff) consec_infra=$((consec_infra + 1)) log "infrastructure failure #$consec_infra — backing off ${delay}s (run $runs/$max_runs)" sleep "$delay" ;; retry) consec_infra=0 log "cycle failure — retrying in ${delay}s (run $runs/$max_runs)" [ -n "$goal" ] && sweep_pr_hygiene "$goal" sleep "$delay" ;; esac done } main "$@"