name: CC drift watcher liveness

# Watches the watcher. The `cc-drift-template-watch.yml` workflow runs every
# 30 min on a self-hosted runner. If that runner goes offline — Hetzner
# reboot, container crash, OAuth credential revoked, CC binary missing,
# refresh failure — the watcher silently stops cycling and class-B drift
# goes uncaught. This alarm catches that case.
#
# Runs every 2 hours on a github-hosted runner (no auth, no self-hosted
# dependency — so it survives the exact failure modes it's meant to catch).
# Checks the most recent `success` run of cc-drift-template-watch.yml; if
# the latest success is more than 8 hours old (≥ 16 missed 30-min cycles),
# opens a `cc-watcher-liveness`-labeled issue. Auto-closes the issue when
# the watcher recovers.
#
# Cron offset to the :15 mark so it never overlaps with the 30-min cron
# of the watcher itself (:00 and :30) — though in practice GitHub Actions
# only fires the watcher's `*/30` schedule every ~4h on this repo (public
# free-tier scheduler is best-effort, not guaranteed). See the threshold
# rationale below.

on:
  schedule:
    - cron: '15 */2 * * *'
  workflow_dispatch:
    inputs:
      force_threshold_hours:
        description: 'Override THRESHOLD_HOURS for this run (validation only — leave blank for default 8h)'
        required: false
        type: string
        default: ''

# Least-privilege token scopes. Anything not listed here is `none`.
permissions:
  contents: read
  issues: write
  # Required by the self-recovery step: `gh workflow run` creates a
  # workflow_dispatch event, which the API rejects without Actions write
  # access on the token.
  actions: write

jobs:
  liveness:
    runs-on: ubuntu-latest
    # The self-recovery poll alone can take 120s (12 x 10s sleeps) plus its
    # API calls; leave headroom for checkout and the issue operations so the
    # job is never cancelled before it gets to open the alert.
    timeout-minutes: 5
    steps:
      # v4.7.1: gh CLI commands below need a git context to resolve the
      # repo (otherwise `gh issue list` exits with "fatal: not a git
      # repository"). actions/checkout supplies it; we don't actually
      # need any files from the checkout, just the .git directory.
      - uses: askalf/checkout-with-retry@744195501c3e2b794c50370b753a7b8c93d084f5 # v1.0.0
        with:
          fetch-depth: 1

      - name: Check watcher last-successful-run age
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Hours-since-last-success threshold for alarming. Originally
          # 3h (= 6 missed 30-min cycles per the declared schedule), but
          # v4.7.1 bumped to 8h after observing real-world cadence on
          # this repo: GitHub Actions' free-tier cron scheduler is
          # best-effort, and the watcher's `*/30 * * * *` typically fires
          # every 2-4 hours in practice. 8h still catches real outages
          # (anything > a day-shift of silence is signal, not noise)
          # while absorbing the scheduler skew.
          #
          # v4.7.2: workflow_dispatch can override THRESHOLD_HOURS for
          # validation runs. Lets us exercise the alert-open path on
          # demand (set inputs.force_threshold_hours=1, dispatch, watch
          # the issue open) without waiting for or synthesizing a real
          # outage. Default empty means "use the hardcoded 8".
          THRESHOLD_HOURS: ${{ github.event.inputs.force_threshold_hours != '' && github.event.inputs.force_threshold_hours || '8' }}
          # Raw operator input, handed to the script as data through the
          # environment. Never interpolate `${{ ... }}` straight into the
          # script text: the expression is expanded before bash parses the
          # script, so its value would be executed as shell code.
          FORCE_THRESHOLD_HOURS: ${{ github.event.inputs.force_threshold_hours }}
        run: |
          set -euo pipefail

          # Reject a malformed override up front. Without this check a
          # non-integer value makes the `-lt` test below error out, which
          # bash treats as "false" inside `if` — the run would silently
          # fall through to the alert path and open a bogus operator alert.
          if ! [[ "$THRESHOLD_HOURS" =~ ^[0-9]+$ ]]; then
            echo "::error::THRESHOLD_HOURS must be a non-negative integer number of hours, got '${THRESHOLD_HOURS}'"
            exit 1
          fi

          # Prints the created_at timestamp of the most recent successful
          # drift-watcher run, or an empty string when there is none.
          # GITHUB_REPOSITORY (owner/name) is set by the Actions runner.
          latest_watcher_success() {
            gh api \
              "repos/${GITHUB_REPOSITORY}/actions/workflows/cc-drift-template-watch.yml/runs?status=success&per_page=1" \
              --jq '.workflow_runs[0].created_at // ""'
          }

          # True when the alert-issue lookup returned a usable issue number.
          have_existing_alert() {
            [ -n "$existing" ] && [ "$existing" != "null" ]
          }

          # Self-heal: ensure the alarm label exists before any issue op.
          # Idempotent — the `|| true` swallows "already exists" errors.
          gh label create cc-watcher-liveness \
            --description "Drift watcher has not had a successful run within liveness threshold" \
            --color "B60205" 2>/dev/null || true

          # Most recent successful run of the drift watcher. The drift
          # watcher's workflow run is `success` for both exit 0 (no drift)
          # AND exit 2 (drift detected) — only exit 1 (infra failure)
          # surfaces as `failure`. So "success" here is "watcher cycled
          # cleanly," which is what liveness means.
          # A failure of this call is deliberately fatal (set -e): without
          # it there is nothing to measure liveness against.
          last_success=$(latest_watcher_success)

          if [ -z "$last_success" ]; then
            echo "No successful watcher runs yet — likely a fresh repo or watcher disabled. Skipping alarm."
            exit 0
          fi

          now=$(date -u +%s)
          last=$(date -u -d "$last_success" +%s)
          # Integer division: partial hours are truncated.
          hours_since=$(( (now - last) / 3600 ))

          echo "Last successful watcher run: $last_success ($hours_since hours ago, threshold ${THRESHOLD_HOURS}h)"

          # Look up any existing open alert issue so we can close it on the
          # healthy path. This lookup is non-blocking — a transient API
          # failure (e.g. HTTP 401 during GH-Actions token provisioning race,
          # observed 2026-05-23 at 11:36 UTC despite same workflow passing
          # 90 min earlier) must NOT fail the whole liveness check, because
          # the watcher's actual liveness was already verified above. We
          # tolerate auth + transient errors and treat them as "no existing
          # issue" — worst case is we don't close a stale alert this cycle
          # (next cycle will retry).
          existing=$(gh issue list --label cc-watcher-liveness --state open --json number --jq '.[0].number' 2>/dev/null) || existing=""

          if [ "$hours_since" -lt "$THRESHOLD_HOURS" ]; then
            echo "Watcher healthy."
            if have_existing_alert; then
              # Same tolerance on the close — if it fails for any transient
              # reason, log a warning + continue. The issue will get closed
              # on the next cycle.
              if gh issue close "$existing" --comment "Watcher recovered: last successful run ${hours_since} hours ago (${last_success}). Closing this alert." 2>/dev/null; then
                echo "Closed stale alert #$existing"
              else
                echo "::warning::Failed to close stale alert #$existing (transient API error); will retry next cycle"
              fi
            fi
            exit 0
          fi

          # ============================================================
          # Past threshold — try SELF-RECOVERY before notifying operator.
          # (Move 6 of the 2026-05-23 dario absolute-minimal-human push.)
          # The vast majority of "watcher offline" causes are recoverable
          # by simply re-dispatching the watcher workflow: the most-common
          # root cause is the watcher cron just got skipped by GitHub's
          # free-tier scheduler. Try one workflow_dispatch + wait for the
          # result before escalating to a GitHub issue.
          # ============================================================
          echo "::warning::cc-drift-template-watch.yml has no successful run in $hours_since hours (threshold: ${THRESHOLD_HOURS}h). Attempting self-recovery before notifying operator."

          # Skip self-recovery on `workflow_dispatch` with force_threshold_hours
          # set — that's the operator's deliberate "exercise the alert path"
          # dry-run. Don't actually recover; just open the issue.
          if [ -n "$FORCE_THRESHOLD_HOURS" ]; then
            echo "force_threshold_hours dry-run mode — skipping self-recovery, going straight to alert."
          else
            echo "[$(date -Iseconds)] Triggering cc-drift-template-watch.yml via workflow_dispatch..."
            # stderr is left visible on purpose: a permissions or ref error
            # here is not transient and should be readable in the job log.
            if gh workflow run cc-drift-template-watch.yml --ref master; then
              echo "Dispatch sent. Waiting up to 120s for a fresh successful run..."
              # Poll for a new successful run more recent than $last_success.
              # Lookup errors are tolerated here; a failed poll just counts
              # as "not yet".
              recovered=false
              for i in $(seq 1 12); do
                sleep 10
                latest=$(latest_watcher_success 2>/dev/null || echo "")
                if [ -n "$latest" ] && [ "$latest" != "$last_success" ]; then
                  echo "[${i}0s] Watcher recovered — new successful run at $latest (was $last_success)"
                  recovered=true
                  break
                fi
                echo "[${i}0s] not yet; still showing $latest"
              done
              if [ "$recovered" = "true" ]; then
                echo "Self-recovery succeeded. Skipping alert. Closing any stale alert issue."
                if have_existing_alert; then
                  gh issue close "$existing" --comment "Watcher self-recovered via liveness-triggered dispatch. Closing this alert." 2>/dev/null || true
                fi
                exit 0
              fi
              echo "Self-recovery did NOT produce a fresh success within 120s. Escalating to operator alert."
            else
              echo "::warning::workflow_dispatch of cc-drift-template-watch.yml failed (gh API transient?). Escalating to operator alert."
            fi
          fi

          # Build the alert body in a temp file; remove it however we exit.
          body_file=$(mktemp)
          trap 'rm -f "$body_file"' EXIT
          {
            echo "## CC drift watcher offline"
            echo ""
            echo "Generated by [\`cc-drift-watcher-liveness.yml\`](.github/workflows/cc-drift-watcher-liveness.yml) on $(date -Iseconds)."
            echo ""
            echo "**Last successful run of [\`cc-drift-template-watch.yml\`](.github/workflows/cc-drift-template-watch.yml): ${hours_since} hours ago** (\`${last_success}\`)."
            echo ""
            echo "Declared cadence: every 30 minutes (\`*/30 * * * *\`). Observed cadence on this repo: typically every 2-4 hours (GitHub Actions' free-tier cron scheduler is best-effort). Threshold for this alarm: ${THRESHOLD_HOURS} hours of silence."
            echo ""
            echo "### Likely causes"
            echo ""
            echo "- Self-hosted runner offline (Hetzner reboot, container restart, network issue)"
            echo "- OAuth credential expired / revoked / refresh failed (look for 401 in the last run's logs)"
            echo "- CC binary missing or upgraded incompatibly"
            echo "- Repository or cron config changed"
            echo ""
            echo "### How to investigate"
            echo ""
            echo "1. Check runner status: [Settings → Actions → Runners](../../settings/actions/runners) — is \`askalf-platform-1\` showing **Online**?"
            echo "2. Check recent runs: [Actions → CC template drift watch (self-hosted)](../../actions/workflows/cc-drift-template-watch.yml) — read the latest failure log"
            echo "3. SSH to the box and probe auth: \`claude --print < /dev/null\` (shares /root/.claude/.credentials.json with the platform dario; platform dario container's /health should show \`oauth: healthy\`)"
            echo "4. Re-run manually after fixing: \`gh workflow run cc-drift-template-watch.yml --ref master\`"
            echo ""
            echo "This alert auto-closes when the watcher next reports a successful run."
          } > "$body_file"

          # Re-use the open alert when there is one so repeated alarms stack
          # up as comments on a single issue instead of opening duplicates.
          if have_existing_alert; then
            gh issue comment "$existing" --body-file "$body_file"
            echo "Updated existing alert #$existing"
          else
            gh issue create \
              --title "Watcher offline: no successful run in ${hours_since} hours" \
              --label cc-watcher-liveness \
              --body-file "$body_file"
          fi