name: Update CIA CSV Data # Refreshes every CSV already tracked under data/cia/** and cia-data/** from the # upstream Hack23/cia sample-data tree: # https://github.com/Hack23/cia/tree/master/service.data.impl/sample-data # # Upstream layout is NOT flat β€” root-level files coexist with sub-folders # (distinct_values/, framework-validation/, risk-rule-tests/). The workflow # therefore builds a basenameβ†’upstream-path index via the GitHub Tree API # (recursive) before downloading, so every tracked local CSV is resolved to # its correct upstream path regardless of sub-folder. # # The workflow is intentionally update-only: it never pulls in CSV files that # are not already tracked in this repository. # # After the CSV refresh it runs the production-stats pipeline # (scripts/load-cia-stats.ts + scripts/update-stats-from-cia.ts) so the # homepage statistics (index*.html) stay in sync with # cia-data/production-stats.json derived from extraction_summary_report.csv. # # When either stage produces changes, a single pull request is opened. on: schedule: # Daily at 03:30 UTC (~04:30 CET / 05:30 CEST), safely after the upstream # CIA extraction job which completes around 02:57 UTC. The 30-minute margin # absorbs upstream runtime variance and ensures we refresh from the latest # extraction rather than the previous day's snapshot. - cron: '30 3 * * *' workflow_dispatch: inputs: ref: description: 'Upstream Hack23/cia git ref (branch, tag, or SHA) to download from' required: false type: string default: 'master' permissions: contents: read jobs: refresh: name: Refresh CIA CSV data and production stats runs-on: ubuntu-26.04 permissions: contents: write pull-requests: write env: UPSTREAM_REF: ${{ github.event.inputs.ref || 'master' }} UPSTREAM_REPO: Hack23/cia UPSTREAM_PATH: service.data.impl/sample-data steps: - name: Harden Runner uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4 with: egress-policy: audit allowed-endpoints: > api.github.com:443 github.com:443 raw.githubusercontent.com:443 objects.githubusercontent.com:443 codeload.github.com:443 registry.npmjs.org:443 nodejs.org:443 - name: Checkout repository uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: token: ${{ secrets.GITHUB_TOKEN }} fetch-depth: 1 - name: Setup Node.js uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: '26' cache: 'npm' cache-dependency-path: | package-lock.json .github/workflows/update-cia-csv-data.yml - name: Install dependencies run: npm ci - name: Resolve upstream ref β†’ SHA id: resolve env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -euo pipefail # URL-encode the ref so branch names that legitimately contain '/' # (e.g. "feature/foo") survive interpolation into the GitHub API # path. jq is used as a dependency-free URL-encoder. encoded_ref="$(printf '%s' "$UPSTREAM_REF" | jq -rR @uri)" echo "πŸ”Ž Resolving ${UPSTREAM_REPO}@${UPSTREAM_REF} (encoded: ${encoded_ref})" # Ask the commits endpoint for the fully-resolved SHA. This accepts # branches, tags, and raw SHAs uniformly and returns the tip commit # SHA. All downstream URLs (tree API + raw downloads) use that SHA, # so there's no risk of a mid-run upstream update producing an # inconsistent CSV / stats snapshot, and no further URL-encoding # is required because a SHA is always [0-9a-f]. sha="$(curl --silent --show-error --fail --location --retry 3 --max-time 30 \ -H "Accept: application/vnd.github.v3+json" \ -H "Authorization: Bearer ${GH_TOKEN}" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "https://api.github.com/repos/${UPSTREAM_REPO}/commits/${encoded_ref}" \ | jq -r '.sha')" if [ -z "$sha" ] || [ "$sha" = "null" ]; then echo "::error::Failed to resolve ${UPSTREAM_REPO}@${UPSTREAM_REF} to a commit SHA" exit 1 fi echo "βœ… Resolved SHA: ${sha}" echo "sha=${sha}" >> "$GITHUB_OUTPUT" - name: Build upstream basename β†’ path index id: index env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }} run: | set -euo pipefail echo "πŸ“‘ Fetching recursive tree for ${UPSTREAM_REPO}@${UPSTREAM_SHA}" tree_json="$(mktemp)" curl --silent --show-error --fail --location --retry 3 --max-time 60 \ -H "Accept: application/vnd.github.v3+json" \ -H "Authorization: Bearer ${GH_TOKEN}" \ -H "X-GitHub-Api-Version: 2022-11-28" \ "https://api.github.com/repos/${UPSTREAM_REPO}/git/trees/${UPSTREAM_SHA}?recursive=1" \ -o "$tree_json" # The GitHub "get a tree" endpoint returns truncated=true when the # tree exceeds ~100k entries or 7MB. The upstream Hack23/cia tree is # currently well under those limits (<10k entries), so a single # recursive call is sufficient. If the repo ever grows past the # threshold, this step fails fast rather than silently producing an # incomplete index β€” at which point the workflow should be updated # to fall back to per-subdirectory tree fetches. truncated="$(jq -r '.truncated' "$tree_json")" if [ "$truncated" = "true" ]; then echo "::error::GitHub tree response is truncated; upstream repo has grown past the single-call limit. Update the workflow to paginate sub-trees." exit 1 fi # index.tsv: \t (duplicate basenames rejected) index_file="${RUNNER_TEMP:-/tmp}/cia_sample_data_index.tsv" jq -r --arg prefix "${UPSTREAM_PATH}/" ' .tree[] | select(.type == "blob") | select(.path | startswith($prefix)) | select(.path | endswith(".csv")) | .path | ltrimstr($prefix) ' "$tree_json" | awk -F/ '{ print $NF "\t" $0 }' | sort > "$index_file" total_csvs="$(wc -l < "$index_file")" dup_count="$(cut -f1 "$index_file" | sort | uniq -d | wc -l)" if [ "$dup_count" -gt 0 ]; then echo "::error::Upstream contains ${dup_count} duplicate CSV basenames; basename-based lookup is ambiguous:" cut -f1 "$index_file" | sort | uniq -d exit 1 fi echo "πŸ“š Upstream index: ${total_csvs} CSVs (no duplicate basenames)" echo "index_file=$index_file" >> "$GITHUB_OUTPUT" echo "total_upstream=$total_csvs" >> "$GITHUB_OUTPUT" rm -f "$tree_json" - name: Refresh tracked CSV files from upstream id: refresh env: INDEX_FILE: ${{ steps.index.outputs.index_file }} UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }} run: | set -euo pipefail # Use the resolved SHA (not the raw ref) so downloads are pinned to # the exact same commit the index was built from, and no URL-encoding # is needed for the raw.githubusercontent.com path. BASE_URL="https://raw.githubusercontent.com/${UPSTREAM_REPO}/${UPSTREAM_SHA}/${UPSTREAM_PATH}" echo "πŸ“₯ Source base URL: ${BASE_URL}" echo "πŸ“š Using index: ${INDEX_FILE}" # Only touch CSVs that are ALREADY tracked in the repository under # data/cia/** or cia-data/**. We never introduce new files. mapfile -t LOCAL_CSVS < <( find data/cia cia-data -type f -name '*.csv' 2>/dev/null | sort -u ) total=${#LOCAL_CSVS[@]} updated=0 unchanged=0 missing=0 skipped=0 failed=0 # Locally-curated files with no upstream equivalent; these must not be # overwritten by the upstream sample-data tree. declare -a SKIP_PATTERNS=( # Local hand-curated mini-samples used by dashboard demos; their # schemas (name,ministry,influence / impact / riskScore / quarter) # do not exist in upstream service.data.impl/sample-data. 'data/cia/ministry/sample_influence.csv' 'data/cia/ministry/sample_decision_impact.csv' 'data/cia/ministry/sample_risk_levels.csv' 'data/cia/ministry/sample_productivity.csv' # Locally-generated forecast products (election scenarios) that # are not published in the upstream CIA sample-data tree. 'cia-data/election/election_forecast.csv' 'cia-data/election/coalition_scenarios.csv' ) is_skipped() { local path="$1" for pat in "${SKIP_PATTERNS[@]}"; do if [ "$path" = "$pat" ]; then return 0 fi done return 1 } # Resolve a local basename to its upstream-relative path. Falls back # to the "_sample.csv" alias, which the repo uses for a few # locally-renamed canonical files (e.g. view_riksdagen_committee_decisions.csv). resolve_upstream_path() { local base="$1" local hit hit="$(awk -v b="$base" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")" if [ -n "$hit" ]; then printf '%s' "$hit" return 0 fi local alias="${base%.csv}_sample.csv" hit="$(awk -v b="$alias" -F'\t' '$1==b {print $2; exit}' "$INDEX_FILE")" if [ -n "$hit" ]; then printf '%s' "$hit" return 0 fi return 1 } tmpdir="$(mktemp -d)" trap 'rm -rf "$tmpdir"' EXIT : > "$tmpdir/updated.txt" : > "$tmpdir/missing.txt" : > "$tmpdir/failed.txt" for local_path in "${LOCAL_CSVS[@]}"; do base="$(basename "$local_path")" if is_skipped "$local_path"; then echo " ⏭ skip $local_path (local-only sample, no upstream)" skipped=$((skipped + 1)) continue fi if ! upstream_rel="$(resolve_upstream_path "$base")"; then echo " ❔ miss $local_path (no upstream basename match)" missing=$((missing + 1)) echo "$local_path" >> "$tmpdir/missing.txt" continue fi url="${BASE_URL}/${upstream_rel}" tmpfile="$tmpdir/$base" http_code="$(curl --silent --show-error --location --retry 3 \ --max-time 60 --output "$tmpfile" --write-out '%{http_code}' \ "$url" || echo '000')" case "$http_code" in 200) if [ ! -s "$tmpfile" ]; then echo " ⚠️ empty $local_path ($url)" failed=$((failed + 1)) echo "$local_path" >> "$tmpdir/failed.txt" continue fi if cmp --silent "$tmpfile" "$local_path"; then unchanged=$((unchanged + 1)) else mkdir -p "$(dirname "$local_path")" mv "$tmpfile" "$local_path" updated=$((updated + 1)) echo "$local_path <- ${upstream_rel}" >> "$tmpdir/updated.txt" echo " βœ… update $local_path <- ${upstream_rel}" fi ;; 404) echo " ❔ miss $local_path (404 at ${upstream_rel})" missing=$((missing + 1)) echo "$local_path" >> "$tmpdir/missing.txt" ;; *) echo " ❌ http=$http_code $local_path ($url)" failed=$((failed + 1)) echo "$local_path" >> "$tmpdir/failed.txt" ;; esac done { echo "total=$total" echo "updated=$updated" echo "unchanged=$unchanged" echo "missing=$missing" echo "skipped=$skipped" echo "failed=$failed" } >> "$GITHUB_OUTPUT" { echo "## πŸ“Š CIA CSV Refresh Summary" echo "" echo "| Metric | Count |" echo "| --- | ---: |" echo "| Total tracked CSVs | $total |" echo "| βœ… Updated | $updated |" echo "| 🟰 Unchanged | $unchanged |" echo "| ❔ Missing upstream | $missing |" echo "| ⏭ Skipped (local-only) | $skipped |" echo "| ❌ Failed downloads | $failed |" echo "" echo "- Upstream ref: \`${UPSTREAM_REF}\`" echo "- Source base: ${BASE_URL}" echo "- Upstream index size: ${{ steps.index.outputs.total_upstream }} CSVs" if [ "$updated" -gt 0 ]; then echo "" echo "### Updated files" echo "" sed 's/^/- /' "$tmpdir/updated.txt" fi if [ "$missing" -gt 0 ]; then echo "" echo "### Missing upstream" echo "" sed 's/^/- /' "$tmpdir/missing.txt" fi if [ "$failed" -gt 0 ]; then echo "" echo "### Failed downloads" echo "" sed 's/^/- /' "$tmpdir/failed.txt" fi } >> "$GITHUB_STEP_SUMMARY" if [ "$failed" -gt 0 ]; then echo "::error::$failed CSV downloads failed; see job summary" exit 1 fi - name: Refresh CIA production statistics id: stats env: UPSTREAM_SHA: ${{ steps.resolve.outputs.sha }} # Pin the statistics fetch to the exact same commit SHA the CSV # refresh just used. `scripts/load-cia-stats.ts` respects the # CIA_EXTRACTION_SUMMARY_URL env var as an override of its default # master-branch URL, so the injected counts stay consistent with the # CSV snapshot when the workflow is dispatched with a non-master ref. CIA_EXTRACTION_SUMMARY_URL: https://raw.githubusercontent.com/${{ env.UPSTREAM_REPO }}/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }}/extraction_summary_report.csv run: | set -euo pipefail echo "πŸ“Š Fetching CIA production statistics from ${CIA_EXTRACTION_SUMMARY_URL}" npx --no-install tsx scripts/load-cia-stats.ts if [ ! -f "cia-data/production-stats.json" ]; then echo "::error::cia-data/production-stats.json was not produced" exit 1 fi total_persons="$(jq -r '.counts.total_persons' cia-data/production-stats.json)" total_votes="$(jq -r '.counts.total_votes' cia-data/production-stats.json)" last_updated="$(jq -r '.metadata.last_updated' cia-data/production-stats.json)" { echo "total_persons=$total_persons" echo "total_votes=$total_votes" echo "last_updated=$last_updated" } >> "$GITHUB_OUTPUT" echo "πŸ–Š Injecting statistics into index*.html (14 language variants)" npx --no-install tsx scripts/update-stats-from-cia.ts { echo "" echo "## πŸ“ˆ Production Statistics Refresh" echo "" echo "| Metric | Value |" echo "| --- | ---: |" echo "| Total Persons | ${total_persons} |" echo "| Total Votes | ${total_votes} |" echo "| Last Extraction | ${last_updated} |" echo "| Pinned commit | \`${UPSTREAM_SHA}\` |" } >> "$GITHUB_STEP_SUMMARY" - name: Detect git changes id: diff run: | set -e # Paths touched by this workflow: CSV trees + stats JSON + injected HTML paths=(data/cia cia-data/production-stats.json cia-data/*.csv cia-data/*/*.csv index.html index_*.html) if [ -n "$(git status --porcelain -- "${paths[@]}")" ]; then echo "has_changes=true" >> "$GITHUB_OUTPUT" echo "πŸ“ Changes detected" git status --porcelain -- "${paths[@]}" | head -30 else echo "has_changes=false" >> "$GITHUB_OUTPUT" echo "βœ… No changes detected" fi - name: Create pull request with refreshed data if: steps.diff.outputs.has_changes == 'true' uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1 with: token: ${{ secrets.GITHUB_TOKEN }} add-paths: | data/cia/** cia-data/** index.html index_*.html commit-message: | data(cia): refresh CSV sample data + production stats from ${{ env.UPSTREAM_REPO }}@${{ steps.resolve.outputs.sha }} - Upstream ref: ${{ env.UPSTREAM_REF }} (resolved to ${{ steps.resolve.outputs.sha }}) - CSV files: updated ${{ steps.refresh.outputs.updated }} / ${{ steps.refresh.outputs.total }} tracked - Production stats: ${{ steps.stats.outputs.total_persons }} persons, ${{ steps.stats.outputs.total_votes }} votes - Last extraction: ${{ steps.stats.outputs.last_updated }} branch: data/cia-csv-refresh delete-branch: true title: 'πŸ“Š Refresh CIA data (${{ steps.refresh.outputs.updated }} CSVs + production stats)' body: | ## πŸ“Š Automated CIA data refresh Pulled the latest `service.data.impl/sample-data/` CSVs from [`${{ env.UPSTREAM_REPO }}@${{ env.UPSTREAM_REF }}`](https://github.com/${{ env.UPSTREAM_REPO }}/tree/${{ steps.resolve.outputs.sha }}/${{ env.UPSTREAM_PATH }}) (resolved to commit [`${{ steps.resolve.outputs.sha }}`](https://github.com/${{ env.UPSTREAM_REPO }}/commit/${{ steps.resolve.outputs.sha }})) and re-ran the production-statistics pipeline against the same commit. ### CSV refresh | Metric | Count | | --- | ---: | | Total tracked CSVs | ${{ steps.refresh.outputs.total }} | | βœ… Updated | ${{ steps.refresh.outputs.updated }} | | 🟰 Unchanged | ${{ steps.refresh.outputs.unchanged }} | | ❔ Missing upstream | ${{ steps.refresh.outputs.missing }} | | ⏭ Skipped (local-only) | ${{ steps.refresh.outputs.skipped }} | | ❌ Failed downloads | ${{ steps.refresh.outputs.failed }} | ### Production statistics - Total persons: **${{ steps.stats.outputs.total_persons }}** - Total votes: **${{ steps.stats.outputs.total_votes }}** - Last extraction: **${{ steps.stats.outputs.last_updated }}** ### How resolution works The workflow never introduces new files β€” it only refreshes CSVs already tracked under `data/cia/**` and `cia-data/**`. Upstream paths are resolved through a recursive `basename β†’ upstream-path` index built from the GitHub Tree API, so files living in sub-folders (`distinct_values/`, `framework-validation/`, `risk-rule-tests/`) are located correctly even though local paths are organised differently. An `_sample.csv` alias is applied for the handful of locally-renamed canonical files. ### Review checklist - [ ] CSV diffs look reasonable (no schema regressions) - [ ] `production-stats.json` counts move monotonically or within reason - [ ] Dashboards still render with the refreshed data - [ ] No unexpected file additions or removals > Generated by `.github/workflows/update-cia-csv-data.yml`. labels: | automated-pipeline data-update cia-intelligence # ISMS Compliance # - ISO 27001:2022 A.5.33 Protection of records β€” Git audit trail + PR review gate # - ISO 27001:2022 A.8.3 Information lifecycle management β€” nightly refresh + change detection # - ISO 27001:2022 A.8.10 Information deletion β€” auto-delete PR branch after merge # - ISO 27001:2022 A.8.19 Security in use β€” HTTPS-only downloads, SHA-pinned actions # - NIST CSF 2.0 PR.DS-5 Data integrity β€” byte-level cmp vs upstream # - NIST CSF 2.0 DE.CM-1 Network monitoring β€” step-security/harden-runner egress audit # - CIS Controls v8.1 3.1 Data inventory β€” explicit local-file discovery # - CIS Controls v8.1 3.14 Data integrity validation β€” diff + PR review before merge # - GDPR Article 6(1)(e) β€” public interest processing (democratic transparency) # - Swedish Offentlighetsprincipen β€” public access to government information