name: Update CIA CSV Data

# Refreshes every CSV already tracked under data/cia/** and cia-data/** from the
# upstream Hack23/cia sample-data tree:
#   https://github.com/Hack23/cia/tree/master/service.data.impl/sample-data
#
# Upstream layout is NOT flat — root-level files coexist with sub-folders
# (distinct_values/, framework-validation/, risk-rule-tests/). The workflow
# therefore builds a basename→upstream-path index via the GitHub Tree API
# (recursive) before downloading, so every tracked local CSV is resolved to
# its correct upstream path regardless of sub-folder.
#
# The workflow is intentionally update-only: it never pulls in CSV files that
# are not already tracked in this repository.
#
# After the CSV refresh it runs the production-stats pipeline
# (scripts/load-cia-stats.ts + scripts/update-stats-from-cia.ts) so the
# homepage statistics (index*.html) stay in sync with
# cia-data/production-stats.json derived from extraction_summary_report.csv.
#
# When either stage produces changes, a single pull request is opened.

on:
  schedule:
    # Daily at 03:30 UTC (~04:30 CET / 05:30 CEST), safely after the upstream
    # CIA extraction job which completes around 02:57 UTC. The 30-minute margin
    # absorbs upstream runtime variance and ensures we refresh from the latest
    # extraction rather than the previous day's snapshot.
    - cron: '30 3 * * *'
  workflow_dispatch:
    inputs:
      ref:
        description: 'Upstream Hack23/cia git ref (branch, tag, or SHA) to download from'
        required: false
        type: string
        default: 'master'

permissions:
  contents: read

jobs:
  refresh:
    name: Refresh CIA CSV data and production stats
    runs-on: ubuntu-26.04
    permissions:
      contents: write
      pull-requests: write
    env:
      UPSTREAM_REF: ${{ github.event.inputs.ref || 'master' }}
      UPSTREAM_REPO: Hack23/cia
      UPSTREAM_PATH: service.data.impl/sample-data

    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
        with:
          egress-policy: audit
          allowed-endpoints: >
            api.github.com:443
            github.com:443
            raw.githubusercontent.com:443
            objects.githubusercontent.com:443
            codeload.github.com:443
            registry.npmjs.org:443
            nodejs.org:443

      - name: Checkout repository
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          fetch-depth: 1

      - name: Setup Node.js
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '26'
          cache: 'npm'
          cache-dependency-path: |
            package-lock.json
            .github/workflows/update-cia-csv-data.yml

      - name: Install dependencies
        run: npm ci

      - name: Resolve upstream ref → SHA
        id: resolve
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail

          # URL-encode the ref so branch names that legitimately contain '/'
          # (e.g. "feature/foo") survive interpolation into the GitHub API
          # path. jq is used as a dependency-free URL-encoder.
          encoded_ref="$(printf '%s' "$UPSTREAM_REF" | jq -rR @uri)"
          echo "🔎 Resolving ${UPSTREAM_REPO}@${UPSTREAM_REF} (encoded: ${encoded_ref})"

          # Ask the commits endpoint for the fully-resolved SHA. This accepts
          # branches, tags, and raw SHAs uniformly and returns the tip commit
          # SHA. All downstream URLs (tree API + raw downloads) use that SHA,
          # so there's no risk of a mid-run upstream update producing an
          # inconsistent CSV / stats snapshot, and no further URL-encoding
          # is required because a SHA is always [0-9a-f].
          sha="$(curl --silent --show-error --fail --location --retry 3 --max-time 30 \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/${UPSTREAM_REPO}/commits/${encoded_ref}" \
            | jq -r '.sha')"

          if [ -z "$sha" ] || [ "$sha" = "null" ]; then
            echo "::error::Failed to resolve ${UPSTREAM_REPO}@${UPSTREAM_REF} to a commit SHA"
            exit 1
          fi
          echo "✅ Resolved SHA: ${sha}"
          echo "sha=${sha}" >> "$GITHUB_OUTPUT"

      - name: Build upstream basename → path index
        id: index
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
        run: |
          set -euo pipefail

          echo "📡 Fetching recursive tree for ${UPSTREAM_REPO}@${UPSTREAM_SHA}"
          tree_json="$(mktemp)"
          curl --silent --show-error --fail --location --retry 3 --max-time 60 \
            -H "Accept: application/vnd.github.v3+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/${UPSTREAM_REPO}/git/trees/${UPSTREAM_SHA}?recursive=1" \
            -o "$tree_json"

          # The GitHub "get a tree" endpoint returns truncated=true when the
          # tree exceeds ~100k entries or 7MB. The upstream Hack23/cia tree is
          # currently well under those limits (<10k entries), so a single
          # recursive call is sufficient. If the repo ever grows past the
          # threshold, this step fails fast rather than silently producing an
          # incomplete index — at which point the workflow should be updated
          # to fall back to per-subdirectory tree fetches.
          truncated="$(jq -r '.truncated' "$tree_json")"
          if [ "$truncated" = "true" ]; then
            echo "::error::GitHub tree response is truncated; upstream repo has grown past the single-call limit. Update the workflow to paginate sub-trees."
            exit 1
          fi

          # index.tsv: <basename>\t<upstream-relative-path> (duplicate basenames rejected)
          index_file="${RUNNER_TEMP:-/tmp}/cia_sample_data_index.tsv"
          jq -r --arg prefix "${UPSTREAM_PATH}/" '
            .tree[]
            | select(.type == "blob")
            | select(.path | startswith($prefix))
            | select(.path | endswith(".csv"))
            | .path | ltrimstr($prefix)
          ' "$tree_json" | awk -F/ '{ print $NF "\t" $0 }' | sort > "$index_file"

          total_csvs="$(wc -l < "$index_file")"
          dup_count="$(cut -f1 "$index_file" | sort | uniq -d | wc -l)"
          if [ "$dup_count" -gt 0 ]; then
            echo "::error::Upstream contains ${dup_count} duplicate CSV basenames; basename-based lookup is ambiguous:"
            cut -f1 "$index_file" | sort | uniq -d
            exit 1
          fi

          echo "📚 Upstream index: ${total_csvs} CSVs (no duplicate basenames)"
          echo "index_file=$index_file" >> "$GITHUB_OUTPUT"
          echo "total_upstream=$total_csvs" >> "$GITHUB_OUTPUT"
          rm -f "$tree_json"

      - name: Refresh tracked CSV files from upstream
        id: refresh
        env:
          INDEX_FILE: ${{ steps.index.outputs.index_file }}
          UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
        run: |
          set -euo pipefail

          # Use the resolved SHA (not the raw ref) so downloads are pinned to
          # the exact same commit the index was built from, and no URL-encoding
          # is needed for the raw.githubusercontent.com path.
          BASE_URL="https://raw.githubusercontent.com/${UPSTREAM_REPO}/${UPSTREAM_SHA}/${UPSTREAM_PATH}"
          echo "📥 Source base URL: ${BASE_URL}"
          echo "📚 Using index: ${INDEX_FILE}"

          # Only touch CSVs that are ALREADY tracked in the repository under
          # data/cia/** or cia-data/**. We never introduce new files.
          mapfile -t LOCAL_CSVS < <(
            find data/cia cia-data -type f -name '*.csv' 2>/dev/null | sort -u
          )

          total=${#LOCAL_CSVS[@]}
          updated=0
          unchanged=0
          missing=0
          skipped=0
          failed=0

          # Locally-curated files with no upstream equivalent; these must not be
          # overwritten by the upstream sample-data tree.
          declare -a SKIP_PATTERNS=(
            # Local hand-curated mini-samples used by dashboard demos; their
            # schemas (name,ministry,influence / impact / riskScore / quarter)
            # do not exist in upstream service.data.impl/sample-data.
            'data/cia/ministry/sample_influence.csv'
            'data/cia/ministry/sample_decision_impact.csv'
            'data/cia/ministry/sample_risk_levels.csv'
            'data/cia/ministry/sample_productivity.csv'
            # Locally-generated forecast products (election scenarios) that
            # are not published in the upstream CIA sample-data tree.
            'cia-data/election/election_forecast.csv'
            'cia-data/election/coalition_scenarios.csv'
          )

          is_skipped() {
            local path="$1"
            for pat in "${SKIP_PATTERNS[@]}"; do
              if [ "$path" = "$pat" ]; then
                return 0
              fi
            done
            return 1
          }

          # Resolve a local basename to its upstream-relative path. Falls back
          # to the "<stem>_sample.csv" alias, which the repo uses for a few
          # locally-renamed canonical files (e.g. view_riksdagen_committee_decisions.csv).
          resolve_upstream_path() {
            local base="$1"
            local hit
            hit="$(awk -v b="$base" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")"
            if [ -n "$hit" ]; then
              printf '%s' "$hit"
              return 0
            fi
            local alias="${base%.csv}_sample.csv"
            hit="$(awk -v b="$alias" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")"
            if [ -n "$hit" ]; then
              printf '%s' "$hit"
              return 0
            fi
            return 1
          }

          tmpdir="$(mktemp -d)"
          trap 'rm -rf "$tmpdir"' EXIT
          : > "$tmpdir/updated.txt"
          : > "$tmpdir/missing.txt"
          : > "$tmpdir/failed.txt"

          for local_path in "${LOCAL_CSVS[@]}"; do
            base="$(basename "$local_path")"

            if is_skipped "$local_path"; then
              echo "  ⏭  skip   $local_path (local-only sample, no upstream)"
              skipped=$((skipped + 1))
              continue
            fi

            if ! upstream_rel="$(resolve_upstream_path "$base")"; then
              echo "  ❔ miss   $local_path (no upstream basename match)"
              missing=$((missing + 1))
              echo "$local_path" >> "$tmpdir/missing.txt"
              continue
            fi

            url="${BASE_URL}/${upstream_rel}"
            tmpfile="$tmpdir/$base"

            http_code="$(curl --silent --show-error --location --retry 3 \
              --max-time 60 --output "$tmpfile" --write-out '%{http_code}' \
              "$url" || echo '000')"

            case "$http_code" in
              200)
                if [ ! -s "$tmpfile" ]; then
                  echo "  ⚠️  empty  $local_path  ($url)"
                  failed=$((failed + 1))
                  echo "$local_path" >> "$tmpdir/failed.txt"
                  continue
                fi
                if cmp --silent "$tmpfile" "$local_path"; then
                  unchanged=$((unchanged + 1))
                else
                  mkdir -p "$(dirname "$local_path")"
                  mv "$tmpfile" "$local_path"
                  updated=$((updated + 1))
                  echo "$local_path  <- ${upstream_rel}" >> "$tmpdir/updated.txt"
                  echo "  ✅ update $local_path  <- ${upstream_rel}"
                fi
                ;;
              404)
                echo "  ❔ miss   $local_path (404 at ${upstream_rel})"
                missing=$((missing + 1))
                echo "$local_path" >> "$tmpdir/missing.txt"
                ;;
              *)
                echo "  ❌ http=$http_code $local_path ($url)"
                failed=$((failed + 1))
                echo "$local_path" >> "$tmpdir/failed.txt"
                ;;
            esac
          done

          {
            echo "total=$total"
            echo "updated=$updated"
            echo "unchanged=$unchanged"
            echo "missing=$missing"
            echo "skipped=$skipped"
            echo "failed=$failed"
          } >> "$GITHUB_OUTPUT"

          {
            echo "## 📊 CIA CSV Refresh Summary"
            echo ""
            echo "| Metric | Count |"
            echo "| --- | ---: |"
            echo "| Total tracked CSVs | $total |"
            echo "| ✅ Updated | $updated |"
            echo "| 🟰 Unchanged | $unchanged |"
            echo "| ❔ Missing upstream | $missing |"
            echo "| ⏭ Skipped (local-only) | $skipped |"
            echo "| ❌ Failed downloads | $failed |"
            echo ""
            echo "- Upstream ref: \`${UPSTREAM_REF}\`"
            echo "- Source base: ${BASE_URL}"
            echo "- Upstream index size: ${{ steps.index.outputs.total_upstream }} CSVs"

            if [ "$updated" -gt 0 ]; then
              echo ""
              echo "### Updated files"
              echo ""
              sed 's/^/- /' "$tmpdir/updated.txt"
            fi
            if [ "$missing" -gt 0 ]; then
              echo ""
              echo "### Missing upstream"
              echo ""
              sed 's/^/- /' "$tmpdir/missing.txt"
            fi
            if [ "$failed" -gt 0 ]; then
              echo ""
              echo "### Failed downloads"
              echo ""
              sed 's/^/- /' "$tmpdir/failed.txt"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$failed" -gt 0 ]; then
            echo "::error::$failed CSV downloads failed; see job summary"
            exit 1
          fi

      - name: Refresh CIA production statistics
        id: stats
        env:
          UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }}
          # Pin the statistics fetch to the exact same commit SHA the CSV
          # refresh just used. `scripts/load-cia-stats.ts` respects the
          # CIA_EXTRACTION_SUMMARY_URL env var as an override of its default
          # master-branch URL, so the injected counts stay consistent with the
          # CSV snapshot when the workflow is dispatched with a non-master ref.
          CIA_EXTRACTION_SUMMARY_URL: https://raw.githubusercontent.com/${{ env.UPSTREAM_REPO }}/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }}/extraction_summary_report.csv
        run: |
          set -euo pipefail

          echo "📊 Fetching CIA production statistics from ${CIA_EXTRACTION_SUMMARY_URL}"
          npx --no-install tsx scripts/load-cia-stats.ts

          if [ ! -f "cia-data/production-stats.json" ]; then
            echo "::error::cia-data/production-stats.json was not produced"
            exit 1
          fi

          total_persons="$(jq -r '.counts.total_persons' cia-data/production-stats.json)"
          total_votes="$(jq -r '.counts.total_votes' cia-data/production-stats.json)"
          last_updated="$(jq -r '.metadata.last_updated' cia-data/production-stats.json)"

          {
            echo "total_persons=$total_persons"
            echo "total_votes=$total_votes"
            echo "last_updated=$last_updated"
          } >> "$GITHUB_OUTPUT"

          echo "🖊  Injecting statistics into index*.html (14 language variants)"
          npx --no-install tsx scripts/update-stats-from-cia.ts

          {
            echo ""
            echo "## 📈 Production Statistics Refresh"
            echo ""
            echo "| Metric | Value |"
            echo "| --- | ---: |"
            echo "| Total Persons | ${total_persons} |"
            echo "| Total Votes | ${total_votes} |"
            echo "| Last Extraction | ${last_updated} |"
            echo "| Pinned commit | \`${UPSTREAM_SHA}\` |"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Detect git changes
        id: diff
        run: |
          set -e
          # Paths touched by this workflow: CSV trees + stats JSON + injected HTML
          paths=(data/cia cia-data/production-stats.json cia-data/*.csv cia-data/*/*.csv index.html index_*.html)
          if [ -n "$(git status --porcelain -- "${paths[@]}")" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            echo "📝 Changes detected"
            git status --porcelain -- "${paths[@]}" | head -30
          else
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            echo "✅ No changes detected"
          fi

      - name: Create pull request with refreshed data
        if: steps.diff.outputs.has_changes == 'true'
        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          add-paths: |
            data/cia/**
            cia-data/**
            index.html
            index_*.html
          commit-message: |
            data(cia): refresh CSV sample data + production stats from ${{ env.UPSTREAM_REPO }}@${{ steps.resolve.outputs.sha }}

            - Upstream ref: ${{ env.UPSTREAM_REF }} (resolved to ${{ steps.resolve.outputs.sha }})
            - CSV files: updated ${{ steps.refresh.outputs.updated }} / ${{ steps.refresh.outputs.total }} tracked
            - Production stats: ${{ steps.stats.outputs.total_persons }} persons, ${{ steps.stats.outputs.total_votes }} votes
            - Last extraction: ${{ steps.stats.outputs.last_updated }}
          branch: data/cia-csv-refresh
          delete-branch: true
          title: '📊 Refresh CIA data (${{ steps.refresh.outputs.updated }} CSVs + production stats)'
          body: |
            ## 📊 Automated CIA data refresh

            Pulled the latest `service.data.impl/sample-data/` CSVs from
            [`${{ env.UPSTREAM_REPO }}@${{ env.UPSTREAM_REF }}`](https://github.com/${{ env.UPSTREAM_REPO }}/tree/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }})
            (resolved to commit [`${{ steps.resolve.outputs.sha }}`](https://github.com/${{ env.UPSTREAM_REPO }}/commit/${{ steps.resolve.outputs.sha }}))
            and re-ran the production-statistics pipeline against the same commit.

            ### CSV refresh
            | Metric | Count |
            | --- | ---: |
            | Total tracked CSVs | ${{ steps.refresh.outputs.total }} |
            | ✅ Updated | ${{ steps.refresh.outputs.updated }} |
            | 🟰 Unchanged | ${{ steps.refresh.outputs.unchanged }} |
            | ❔ Missing upstream | ${{ steps.refresh.outputs.missing }} |
            | ⏭ Skipped (local-only) | ${{ steps.refresh.outputs.skipped }} |
            | ❌ Failed downloads | ${{ steps.refresh.outputs.failed }} |

            ### Production statistics
            - Total persons: **${{ steps.stats.outputs.total_persons }}**
            - Total votes: **${{ steps.stats.outputs.total_votes }}**
            - Last extraction: **${{ steps.stats.outputs.last_updated }}**

            ### How resolution works
            The workflow never introduces new files — it only refreshes CSVs
            already tracked under `data/cia/**` and `cia-data/**`. Upstream paths
            are resolved through a recursive `basename → upstream-path` index
            built from the GitHub Tree API, so files living in
            sub-folders (`distinct_values/`, `framework-validation/`,
            `risk-rule-tests/`) are located correctly even though local paths
            are organised differently. An `<stem>_sample.csv` alias is applied
            for the handful of locally-renamed canonical files.

            ### Review checklist
            - [ ] CSV diffs look reasonable (no schema regressions)
            - [ ] `production-stats.json` counts move monotonically or within reason
            - [ ] Dashboards still render with the refreshed data
            - [ ] No unexpected file additions or removals

            > Generated by `.github/workflows/update-cia-csv-data.yml`.
          labels: |
            automated-pipeline
            data-update
            cia-intelligence

# ISMS Compliance
# - ISO 27001:2022 A.5.33 Protection of records — Git audit trail + PR review gate
# - ISO 27001:2022 A.8.3  Information lifecycle management — nightly refresh + change detection
# - ISO 27001:2022 A.8.10 Information deletion — auto-delete PR branch after merge
# - ISO 27001:2022 A.8.19 Security in use — HTTPS-only downloads, SHA-pinned actions
# - NIST CSF 2.0 PR.DS-5 Data integrity — byte-level cmp vs upstream
# - NIST CSF 2.0 DE.CM-1 Network monitoring — step-security/harden-runner egress audit
# - CIS Controls v8.1 3.1  Data inventory — explicit local-file discovery
# - CIS Controls v8.1 3.14 Data integrity validation — diff + PR review before merge
# - GDPR Article 6(1)(e) — public interest processing (democratic transparency)
# - Swedish Offentlighetsprincipen — public access to government information