name: 'CC drift watcher liveness'

# Watches the watcher. The `cc-drift-template-watch.yml` workflow runs every
# 30 min on a self-hosted runner. If that runner goes offline — Hetzner
# reboot, container crash, OAuth credential revoked, CC binary missing,
# refresh failure — the watcher silently stops cycling and class-B drift
# goes uncaught. This alarm catches that case.
#
# Runs every 2 hours on a github-hosted runner (no auth, no self-hosted
# dependency — so it survives the exact failure modes it's meant to catch).
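# Checks the most recent `success` run of cc-drift-template-watch.yml; if
# the latest success is at least 8 hours old (≥ 16 missed 30-min cycles),
# it first re-dispatches the watcher once and waits up to 120s for a fresh
# success. Only if that does not bring the watcher back does it open a
# `cc-watcher-liveness`-labeled issue (or comment on the one already open).
# Auto-closes the issue when the watcher recovers.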
#
# Cron offset to the :15 mark so it never overlaps with the 30-min cron
# of the watcher itself (:00 and :30) — though in practice GitHub Actions
# only fires the watcher's `*/30` schedule every 2-4h on this repo (the
# public free-tier scheduler is best-effort, not guaranteed). See the
# threshold rationale below.

on:
  # Manual trigger, used to exercise the alert path on demand. The optional
  # input replaces the default alarm threshold for that one run.
  workflow_dispatch:
    inputs:
      force_threshold_hours:
        type: string
        required: false
        default: ""
        description: "Override THRESHOLD_HOURS for this run (validation only — leave blank for default 8h)"
  # Scheduled trigger: minute 15 of every second hour.
  schedule:
    - cron: "15 */2 * * *"

# Least-privilege scopes for GITHUB_TOKEN. Any scope not listed here is set
# to `none` once a `permissions:` block is present.
permissions:
  # Required by the self-recovery path: `gh workflow run` POSTs a
  # workflow_dispatch event for the watcher, which the API rejects
  # (HTTP 403) unless the token has write access to Actions. Also covers
  # reading the watcher's run history.
  actions: write
  # Checkout only provides the git context that the gh CLI needs.
  contents: read
  # Create the alarm label, and open / comment on / close the alert issue.
  issues: write

jobs:
  # Single job: measure how long ago the drift watcher last succeeded, try
  # one re-dispatch when that is past the threshold, and open (or update) a
  # `cc-watcher-liveness` issue when the re-dispatch does not bring it back.
  liveness:
    runs-on: ubuntu-latest
    # The self-recovery path alone polls for up to 120s (12 x 10s sleeps plus
    # one API call per iteration), on top of checkout and the issue calls.
    # A 3-minute cap left almost no headroom, and hitting the timeout cancels
    # the job *before* the alert issue is opened — exactly when it matters.
    timeout-minutes: 5
    steps:
      # v4.7.1: gh CLI commands below need a git context to resolve the
      # repo (otherwise `gh issue list` exits with "fatal: not a git
      # repository"). The checkout step supplies it; we don't actually
      # need any files from the checkout, just the .git directory.
      - uses: askalf/checkout-with-retry@744195501c3e2b794c50370b753a7b8c93d084f5  # v1.0.0
        with:
          fetch-depth: 1

      - name: Check watcher last-successful-run age
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Hours-since-last-success threshold for alarming. Originally
          # 3h (= 6 missed 30-min cycles per the declared schedule), but
          # v4.7.1 bumped to 8h after observing real-world cadence on
          # this repo: GitHub Actions' free-tier cron scheduler is
          # best-effort, and the watcher's `*/30 * * * *` typically fires
          # every 2-4 hours in practice. 8h still catches real outages
          # (anything > a day-shift of silence is signal, not noise)
          # while absorbing the scheduler skew.
          #
          # v4.7.2: workflow_dispatch can override THRESHOLD_HOURS for
          # validation runs. Lets us exercise the alert-open path on
          # demand (set inputs.force_threshold_hours=1, dispatch, watch
          # the issue open) without waiting for or synthesizing a real
          # outage. Default empty means "use the hardcoded 8".
          THRESHOLD_HOURS: ${{ github.event.inputs.force_threshold_hours != '' && github.event.inputs.force_threshold_hours || '8' }}
          # Raw dispatch input (empty on scheduled runs). Passed through the
          # environment rather than interpolated into the script text, so a
          # value containing quotes or `$(...)` can never be parsed as shell.
          FORCE_THRESHOLD_HOURS: ${{ github.event.inputs.force_threshold_hours }}
        run: |
          set -euo pipefail

          # THRESHOLD_HOURS can come from free-form dispatch input. A value
          # that is not a whole number (e.g. "0.5", "8h") would make the
          # `[ ... -lt ... ]` test below error out, which inside `if` reads
          # as "past threshold" and would raise a bogus alert. Fail fast.
          case "$THRESHOLD_HOURS" in
            ''|*[!0-9]*)
              echo "::error::THRESHOLD_HOURS must be a non-negative whole number of hours, got '${THRESHOLD_HOURS}'"
              exit 1
              ;;
          esac

          # Prints the created_at timestamp of the newest `success` run of
          # the drift watcher, or an empty string when there is none.
          # Returns non-zero when the API call itself fails.
          latest_watcher_success() {
            gh api \
              "repos/${GITHUB_REPOSITORY}/actions/workflows/cc-drift-template-watch.yml/runs?status=success&per_page=1" \
              --jq '.workflow_runs[0].created_at // ""'
          }

          # Self-heal: ensure the alarm label exists before any issue op.
          # Idempotent — the `|| true` swallows "already exists" errors.
          gh label create cc-watcher-liveness \
            --description "Drift watcher has not had a successful run within liveness threshold" \
            --color "B60205" 2>/dev/null || true

          # Most recent successful run of the drift watcher. The drift
          # watcher's workflow run is `success` for both exit 0 (no drift)
          # AND exit 2 (drift detected) — only exit 1 (infra failure)
          # surfaces as `failure`. So "success" here is "watcher cycled
          # cleanly," which is what liveness means. An API failure here is
          # deliberately fatal (set -e): without this value there is
          # nothing to judge.
          last_success=$(latest_watcher_success)

          if [ -z "$last_success" ]; then
            echo "No successful watcher runs yet — likely a fresh repo or watcher disabled. Skipping alarm."
            exit 0
          fi

          now=$(date -u +%s)
          last=$(date -u -d "$last_success" +%s)
          # Integer division: partial hours are truncated.
          hours_since=$(( (now - last) / 3600 ))

          echo "Last successful watcher run: $last_success ($hours_since hours ago, threshold ${THRESHOLD_HOURS}h)"

          # Look up any existing open alert issue so we can close it on the
          # healthy path. This lookup is non-blocking — a transient API
          # failure (e.g. HTTP 401 during GH-Actions token provisioning race,
          # observed 2026-05-23 at 11:36 UTC despite same workflow passing
          # 90 min earlier) must NOT fail the whole liveness check, because
          # the watcher's actual liveness was already verified above. We
          # tolerate auth + transient errors and treat them as "no existing
          # issue" — worst case is we don't close a stale alert this cycle
          # (next cycle will retry).
          existing=$(gh issue list --label cc-watcher-liveness --state open --json number --jq '.[0].number' 2>/dev/null) || existing=""
          # An empty issue list can surface as the literal string "null";
          # normalise it once so every later check is a plain -n test.
          if [ "$existing" = "null" ]; then
            existing=""
          fi

          if [ "$hours_since" -lt "$THRESHOLD_HOURS" ]; then
            echo "Watcher healthy."
            if [ -n "$existing" ]; then
              # Same tolerance on the close — if it fails for any transient
              # reason, log a warning + continue. The issue will get closed
              # on the next cycle.
              if gh issue close "$existing" --comment "Watcher recovered: last successful run ${hours_since} hours ago (${last_success}). Closing this alert." 2>/dev/null; then
                echo "Closed stale alert #$existing"
              else
                echo "::warning::Failed to close stale alert #$existing (transient API error); will retry next cycle"
              fi
            fi
            exit 0
          fi

          # ============================================================
          # Past threshold — try SELF-RECOVERY before notifying operator.
          # (Move 6 of the 2026-05-23 dario absolute-minimal-human push.)
          # The vast majority of "watcher offline" causes are recoverable
          # by simply re-dispatching the watcher workflow: the most-common
          # root cause is the watcher cron just got skipped by GitHub's
          # free-tier scheduler. Try one workflow_dispatch + wait for the
          # result before escalating to a GitHub issue.
          # ============================================================
          echo "::warning::cc-drift-template-watch.yml has no successful run in $hours_since hours (threshold: ${THRESHOLD_HOURS}h). Attempting self-recovery before notifying operator."

          # Skip self-recovery on `workflow_dispatch` with force_threshold_hours
          # set — that's the operator's deliberate "exercise the alert path"
          # dry-run. Don't actually recover; just open the issue.
          if [ -n "${FORCE_THRESHOLD_HOURS:-}" ]; then
            echo "force_threshold_hours dry-run mode — skipping self-recovery, going straight to alert."
          else
            echo "[$(date -Iseconds)] Triggering cc-drift-template-watch.yml via workflow_dispatch..."
            # Capture gh's output instead of discarding it: a rejected
            # dispatch (for example a token without write access to Actions)
            # must be distinguishable from a transient API error in the log.
            if dispatch_output=$(gh workflow run cc-drift-template-watch.yml --ref master 2>&1); then
              echo "$dispatch_output"
              echo "Dispatch sent. Waiting up to 120s for a fresh successful run..."

              # Poll for a new successful run more recent than $last_success.
              # Errors while polling are tolerated and count as "not yet".
              recovered=false
              for i in $(seq 1 12); do
                sleep 10
                latest=$(latest_watcher_success 2>/dev/null || echo "")
                if [ -n "$latest" ] && [ "$latest" != "$last_success" ]; then
                  echo "[${i}0s] Watcher recovered — new successful run at $latest (was $last_success)"
                  recovered=true
                  break
                fi
                echo "[${i}0s] not yet; still showing $latest"
              done

              if [ "$recovered" = "true" ]; then
                echo "Self-recovery succeeded. Skipping alert. Closing any stale alert issue."
                if [ -n "$existing" ]; then
                  gh issue close "$existing" --comment "Watcher self-recovered via liveness-triggered dispatch. Closing this alert." 2>/dev/null || true
                fi
                exit 0
              fi

              echo "Self-recovery did NOT produce a fresh success within 120s. Escalating to operator alert."
            else
              echo "gh workflow run output: ${dispatch_output}"
              echo "::warning::workflow_dispatch of cc-drift-template-watch.yml failed (gh API transient?). Escalating to operator alert."
            fi
          fi

          # Build the issue body in a temp file so it can be passed with
          # --body-file to either `gh issue comment` or `gh issue create`.
          body_file=$(mktemp)
          {
            echo "## CC drift watcher offline"
            echo ""
            echo "Generated by [\`cc-drift-watcher-liveness.yml\`](.github/workflows/cc-drift-watcher-liveness.yml) on $(date -Iseconds)."
            echo ""
            echo "**Last successful run of [\`cc-drift-template-watch.yml\`](.github/workflows/cc-drift-template-watch.yml): ${hours_since} hours ago** (\`${last_success}\`)."
            echo ""
            echo "Declared cadence: every 30 minutes (\`*/30 * * * *\`). Observed cadence on this repo: typically every 2-4 hours (GitHub Actions' free-tier cron scheduler is best-effort). Threshold for this alarm: ${THRESHOLD_HOURS} hours of silence."
            echo ""
            echo "### Likely causes"
            echo ""
            echo "- Self-hosted runner offline (Hetzner reboot, container restart, network issue)"
            echo "- OAuth credential expired / revoked / refresh failed (look for 401 in the last run's logs)"
            echo "- CC binary missing or upgraded incompatibly"
            echo "- Repository or cron config changed"
            echo ""
            echo "### How to investigate"
            echo ""
            echo "1. Check runner status: [Settings → Actions → Runners](../../settings/actions/runners) — is \`askalf-platform-1\` showing **Online**?"
            echo "2. Check recent runs: [Actions → CC template drift watch (self-hosted)](../../actions/workflows/cc-drift-template-watch.yml) — read the latest failure log"
            echo "3. SSH to the box and probe auth: \`claude --print < /dev/null\` (shares /root/.claude/.credentials.json with the platform dario; platform dario container's /health should show \`oauth: healthy\`)"
            echo "4. Re-run manually after fixing: \`gh workflow run cc-drift-template-watch.yml --ref master\`"
            echo ""
            echo "This alert auto-closes when the watcher next reports a successful run."
          } > "$body_file"

          # One open alert at a time: append to it if present, else open one.
          # Failures here are intentionally fatal so a lost alert shows up
          # as a red run.
          if [ -n "$existing" ]; then
            gh issue comment "$existing" --body-file "$body_file"
            echo "Updated existing alert #$existing"
          else
            gh issue create \
              --title "Watcher offline: no successful run in ${hours_since} hours" \
              --label cc-watcher-liveness \
              --body-file "$body_file"
          fi