name: CI

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]

# Least privilege for the automatic GITHUB_TOKEN: jobs only need to read the
# repository unless they say otherwise. A job that must write declares its own
# job-level `permissions:` block, which replaces this default for that job.
permissions:
  contents: read

# One in-flight run per workflow + ref; a newer push cancels the older run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # ──────────────────────────────────────────────
  # Lint & type-check the Next.js web app
  # ──────────────────────────────────────────────
  web-lint:
    name: Web — Lint & Type-check
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run:` step below executes inside the web app package.
        working-directory: apps/web
    steps:
      - uses: actions/checkout@v6
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v6
        with:
          node-version: '22'
          cache: pnpm
          cache-dependency-path: pnpm-lock.yaml
      - run: pnpm install --frozen-lockfile
      - run: pnpm run lint
      - run: pnpm run type-check

  # ──────────────────────────────────────────────
  # Vitest smoke tests for the Next.js web app
  # The web app used to have *zero* tests gated in CI — lint + typecheck
  # were the only signal. These smoke tests render the components a
  # reviewer cares about (Hero, ComparisonTable, Sidebar, TopBar,
  # MarketplaceView, CaseWorkspace) and assert on the wording we agreed
  # to keep honest (no "every commit", alert reduction tagged as a
  # measurement, etc.). They run with jsdom; no browser required.
# ──────────────────────────────────────────────
  web-test:
    name: Web — Vitest smoke tests
    # Only runs once lint + type-check have passed.
    needs: web-lint
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: apps/web
    steps:
      - uses: actions/checkout@v6
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v6
        with:
          cache: pnpm
          cache-dependency-path: pnpm-lock.yaml
          node-version: "22"
      - run: pnpm install --frozen-lockfile
      - run: pnpm test

  # ──────────────────────────────────────────────
  # Build the Next.js web app
  # ──────────────────────────────────────────────
  web-build:
    name: Web — Build
    # Gated on lint, same as the smoke tests.
    needs: web-lint
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: apps/web
    steps:
      - uses: actions/checkout@v6
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v6
        with:
          cache: pnpm
          cache-dependency-path: pnpm-lock.yaml
          node-version: "22"
      - run: pnpm install --frozen-lockfile
      - run: pnpm build

  # ──────────────────────────────────────────────
  # @aisoc/mcp — MCP server for Claude/Cursor/Cody
  # Typechecks, runs the vitest contract tests (config + tools registry +
  # installer file-write behaviour), and builds the dist bundle so the
  # `bin/aisoc-mcp` shebang is exercised on the same Node version we
  # publish under. Lives in the pnpm workspace; we reuse the root
  # lockfile to keep installs fast and reproducible.
# ──────────────────────────────────────────────
  mcp-test:
    name: MCP — Type-check, Test & Build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v6
        with:
          cache: pnpm
          cache-dependency-path: pnpm-lock.yaml
          node-version: "22"
      - name: Install workspace deps
        run: pnpm install --frozen-lockfile
      - name: Type-check
        run: pnpm --filter @aisoc/mcp typecheck
      - name: Test
        run: pnpm --filter @aisoc/mcp test
      - name: Build
        run: pnpm --filter @aisoc/mcp build

  # ──────────────────────────────────────────────
  # Python API & services
  # ──────────────────────────────────────────────
  python-lint:
    name: Python — Lint & Type-check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      # Pin to versions compatible with the constraints declared in each
      # service's ``pyproject.toml`` (``ruff = "^0.4.4"``, ``mypy = "^1.10.0"``).
      # The CI gate previously left these unpinned, so every new ruff release
      # (e.g. 0.15.x adding ``UP037`` enforcement) silently broke main without
      # any dependency-file change. Dependabot bumps to the pyproject
      # constraints (PRs #186, #187) update these pins in the same diff.
      # NOTE(review): mypy is installed here, but no step in this job invokes
      # it — confirm whether a type-check step is expected.
      - name: Install Ruff & Mypy
        run: pip install 'ruff>=0.4.4,<0.5' 'mypy>=1.10,<2'

      # All services share the repo-root `ruff.toml`. Linting and formatting
      # cover every Python service, not just the original four — the gate
      # used to soft-fail with `|| true`; that's gone, so this is the real
      # signal.
      - name: Ruff lint (all services)
        run: ruff check services/
      - name: Ruff format check (all services)
        run: ruff format --check services/

      # The aisoc-api Docker image is built with `services/api` as its build
      # context, so anything under `services/agents` is *not* shipped in the
      # container. The natural-language query translator is consumed by both
      # the agents runtime and the API's `/nl-query/*` endpoints, so we keep
      # an in-tree vendored mirror at `services/api/app/_vendor/nl_query/`.
      # This guard fails the build the moment the two trees drift — re-run
      # `python scripts/sync_vendored_nl_query.py` locally to fix.
      - name: Verify vendored nl_query is in sync
        run: python scripts/sync_vendored_nl_query.py --check

      # Same story for the correlation-narrative builder. It lives canonically
      # in `services/fusion/app/services/narrative.py` (called at fusion time
      # to populate `FusedAlert.narrative`) and is mirrored into
      # `services/api/app/_vendor/narrative.py` so the API can lazily compute
      # the narrative on first read of an alert whose `narrative` column is
      # still NULL — without depending on `services/fusion` being inside the
      # Docker build context. Re-run `python scripts/sync_vendored_narrative.py`
      # locally if this check fails.
      - name: Verify vendored narrative builder is in sync
        run: python scripts/sync_vendored_narrative.py --check

  python-test:
    name: Python — Tests
    needs: python-lint
    runs-on: ubuntu-latest
    env:
      # Folded into a single space-separated line. It is spliced into the
      # install command via `${{ env.API_DEPS }}` (not `$API_DEPS`), so the
      # double quotes around extras/version specifiers are parsed by the shell.
      API_DEPS: >-
        fastapi "pydantic[email]" pydantic-settings "sqlalchemy[asyncio]"
        asyncpg structlog "python-jose[cryptography]" "passlib[bcrypt]"
        tenacity pyyaml prometheus-client
        "opentelemetry-sdk" "opentelemetry-api"
        "opentelemetry-exporter-otlp-proto-grpc"
        "opentelemetry-instrumentation-fastapi"
        "strawberry-graphql[fastapi]"
        neo4j redis celery httpx aiofiles email-validator PyJWT
        "sqlglot>=23,<27" aiosqlite pytest pytest-asyncio
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: pip
          python-version: "3.12"
      - name: Install API dependencies
        run: pip install --quiet ${{ env.API_DEPS }}

      # Boot-test: verify the FastAPI app can actually import without
      # NameError / ImportError before spending time on the full pytest
      # suite. PR #152 fixed two import-time NameErrors (model_validator,
      # rule_tuning) that CI never caught because pytest mocks imports and
      # never loads the real entrypoint. This step prevents that class of
      # regression from reaching main again.
      - name: Boot-test API service (catch import-time errors)
        working-directory: services/api
        env:
          DATABASE_URL: 'postgresql+asyncpg://x:x@localhost/x'
          ENVIRONMENT: development
          SECRET_KEY: 'ci-dummy-secret-key-at-least-32bytes!'
        run: |
          python -c "from app.main import app; print('api boot-test ok')"

      - name: Run API tests (unit + GraphQL schema)
        working-directory: services/api
        env:
          DATABASE_URL: 'postgresql+asyncpg://x:x@localhost/x'
          ENVIRONMENT: development
          SECRET_KEY: 'ci-dummy-secret-key-at-least-32bytes!'
        run: python -m pytest tests/ -v --tb=short

      # `test_audit_helpers.py` imports `app.investigator.state` directly
      # (it stubs out the package __init__ to avoid pulling in LangGraph),
      # so it only needs stdlib + pydantic — already installed via API_DEPS.
      # `test_fusion_client.py` (Issue #190) exercises the thin httpx
      # client that wires `DetectAgent.process` to the fusion service's
      # `POST /process` endpoint; it mocks the network with respx so the
      # suite stays offline-safe. The test imports `DetectAgent` from
      # `app.agents`, which transitively loads sibling sub-agents
      # (auto_triage / phishing / identity / insider_threat / cloud) that
      # import `langchain_core` and `langchain_openai` at module scope —
      # so both packages are required at import time, even though the
      # fusion-client test itself only calls into `app.tools.fusion`.
      - name: Run agents unit tests (Pillar-1 eval suites + audit helpers + fusion client)
        run: |
          pip install --quiet pytest pytest-asyncio respx langchain-core langchain-openai
          cd services/agents
          python3 -m pytest \
            tests/test_mitre_accuracy.py \
            tests/test_alert_reduction.py \
            tests/test_investigation_completeness.py \
            tests/test_response_quality.py \
            tests/test_audit_helpers.py \
            tests/test_fusion_client.py \
            -v --tb=short

  # ──────────────────────────────────────────────
  # Python micro-services unit tests
  # `services/fusion`, `services/honeytokens`, and `services/purple-team`
  # used to ship with no Python tests gated in CI at all — only ruff +
  # format check. These suites cover the pure helpers in each service
  # (alert fingerprinting/correlation keys, honeytoken generators,
  # ATT&CK coverage matrix builder) plus the thin FastAPI router contracts
  # that don't need Postgres / Kafka / Redis to exercise (e.g. the
  # fusion service's ``POST /process`` endpoint added in Issue #190,
  # which uses an injected fake worker via ``set_worker(...)``). Real
  # signal, fast — no service boot required.
  # ──────────────────────────────────────────────
  python-services-test:
    name: Python — Service unit tests (fusion, honeytokens, purple-team)
    needs: python-lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: pip
          python-version: "3.12"

      # ``fastapi`` + ``httpx`` are required by the fusion router-contract
      # tests (Issue #190): they import ``app.api.router`` and drive it
      # with FastAPI's ``ASGITransport`` + ``httpx.AsyncClient``. ``aiokafka``
      # is needed because ``app/api/router.py`` exposes ``set_worker(...)``
      # which type-references ``FusionWorker`` from ``app.workers.consumer``;
      # importing the router module therefore eagerly evaluates the
      # consumer module, which itself imports ``aiokafka`` at module scope.
      # We deliberately do NOT install the full API dep set here — the goal
      # of this job stays "pure helpers + thin router contracts, no
      # service boot, no Postgres/Kafka/Redis".
      - name: Install minimal deps
        run: pip install --quiet pydantic pydantic-settings pytest pytest-asyncio structlog redis aioredis fastapi httpx aiokafka

      - name: Run fusion unit tests
        working-directory: services/fusion
        run: python -m pytest tests/ -v --tb=short
      - name: Run honeytokens unit tests
        working-directory: services/honeytokens
        run: python -m pytest tests/ -v --tb=short
      - name: Run purple-team unit tests
        working-directory: services/purple-team
        run: python -m pytest tests/ -v --tb=short

  # ──────────────────────────────────────────────
  # SDK packages (Python)
  # The three Python SDKs ship as pip packages but never had a CI gate —
  # regressions could land silently. This job installs each in editable
  # mode and runs its bundled pytest suite. They're intentionally light
  # on dependencies (httpx, pydantic, click, pyyaml) so the job is fast.
  # ──────────────────────────────────────────────
  sdk-python-test:
    name: SDK — Python (pytest)
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: pip
          python-version: "3.12"

      # Each `[dev]` extra pins pytest + pytest-asyncio; pytest-httpx is
      # only needed by `aisoc-sdk` (the typed client tests), so we add it
      # explicitly rather than expanding the package extras.
      - name: Install SDKs in editable mode (with dev extras)
        run: |
          pip install --quiet \
            -e packages/plugin-sdk-py \
            -e packages/sdk-py \
            -e packages/aisoc-cli \
            pytest pytest-asyncio pytest-httpx

      - name: Run aisoc-plugin-sdk tests
        run: pytest packages/plugin-sdk-py/tests/ -v --tb=short
      - name: Run aisoc-sdk tests
        run: pytest packages/sdk-py/tests/ -v --tb=short
      - name: Run aisoc-cli tests
        run: pytest packages/aisoc-cli/tests/ -v --tb=short

  # ──────────────────────────────────────────────
  # SDK packages (TypeScript)
  # `@aisoc/sdk` is a typed openapi-fetch client with a self-contained
  # vitest suite (HTTP mocked, no real server). Same pnpm workspace
  # pattern as the MCP job — install once at the root, then filter.
# ──────────────────────────────────────────────
  sdk-ts-test:
    name: SDK — TypeScript (vitest)
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: pnpm/action-setup@v4
      - uses: actions/setup-node@v6
        with:
          node-version: '22'
          cache: pnpm
          cache-dependency-path: pnpm-lock.yaml
      - name: Install workspace deps
        run: pnpm install --frozen-lockfile
      - name: Type-check
        run: pnpm --filter @aisoc/sdk typecheck
      - name: Test
        run: pnpm --filter @aisoc/sdk test
      - name: Build
        run: pnpm --filter @aisoc/sdk build

  # ──────────────────────────────────────────────
  # Go services (enrichment, ingest, demo-producer)
  # ──────────────────────────────────────────────
  go-build:
    name: Go — Build & Vet
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # One job per service directory under `services/`.
        service: [enrichment, ingest, demo-producer]
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-go@v6
        with:
          go-version: '1.25'
          cache-dependency-path: services/${{ matrix.service }}/go.sum
      - name: Build
        run: |
          cd services/${{ matrix.service }}
          go build ./...
      - name: Vet
        run: |
          cd services/${{ matrix.service }}
          go vet ./...
      - name: Test
        run: |
          cd services/${{ matrix.service }}
          go test ./... -count=1

  # ──────────────────────────────────────────────
  # Go SDK (`packages/plugin-sdk-go`)
  # Lives outside `services/` so the matrix above doesn't pick it up.
  # Has `client_test.go`, `loader_test.go`, `registry_test.go` already —
  # they were never gated. Build + vet + test, no soft-fails.
  # ──────────────────────────────────────────────
  sdk-go-test:
    name: SDK — Go (build, vet, test)
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-go@v6
        with:
          go-version: '1.25'
          cache-dependency-path: packages/plugin-sdk-go/go.sum
      - name: Build
        run: |
          cd packages/plugin-sdk-go
          go build ./...
      - name: Vet
        run: |
          cd packages/plugin-sdk-go
          go vet ./...
      - name: Test
        run: |
          cd packages/plugin-sdk-go
          go test ./... -count=1

  # ──────────────────────────────────────────────
  # Pillar-1 Eval Gate: 4-suite eval harness (200-incident synthetic dataset)
  #   - Alert reduction ratio (≥70%)        — REAL measurement against a
  #                                           fixed noisy alert stream
  #   - MITRE ATT&CK tactic accuracy (≥80%) — substrate self-consistency
  #                                           gate (extractor vs. dataset
  #                                           that's written to feed it)
  #   - Investigation completeness (≥85%)   — substrate self-consistency
  #                                           gate (templated report vs.
  #                                           keywords from the same data)
  #   - Response-plan quality (≥80%)        — substrate self-consistency
  #                                           gate (templated plan vs.
  #                                           rubric using the same data)
  # All suites run offline (no LLM, no DB) against the deterministic
  # `services/agents/tests/eval_data/synthetic_incidents.json` dataset.
  # This harness gates the substrate (extractors, fusion, templates, judges)
  # — it does NOT call the live LLM agent, so three of the four metrics are
  # self-consistency gates rather than agent accuracy scores. See
  # `apps/docs/docs/benchmark.md` for what each suite actually measures.
  # The job posts a markdown summary, uploads `eval_report.json`, and
  # commits historical numbers to the `eval-results` branch as
  # `eval/results/<commit-sha>.json` so the public eval harness page can
  # query them over time.
  # ──────────────────────────────────────────────
  p1-eval:
    name: P1 Eval — Pillar-1 Eval Harness Gate
    runs-on: ubuntu-latest
    # Write access is needed by the publish step, which pushes to the
    # `eval-results` branch. That step only runs on pushes to main.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Pull last published baseline from eval-results branch
        # Wave 2 — w2-dac. We fetch `eval/results/latest.json` from the
        # `eval-results` orphan branch (published by the publish step below
        # on main). When present, the eval runner gates a ≥1pp MITRE
        # regression vs. that baseline; missing baseline = no-op.
        if: github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -e
          mkdir -p /tmp/eval-baseline
          if git fetch origin eval-results:eval-results 2>/dev/null; then
            # A branch without latest.json leaves an empty redirect target
            # behind; remove it so the `-f` test below reports "no baseline".
            git --no-pager show eval-results:eval/results/latest.json \
              > /tmp/eval-baseline/latest.json 2>/dev/null \
              || rm -f /tmp/eval-baseline/latest.json
          fi
          if [ -f /tmp/eval-baseline/latest.json ]; then
            echo "Baseline found at /tmp/eval-baseline/latest.json"
          else
            echo "No baseline yet — first run will publish one."
          fi

      - name: Install eval dependencies
        run: pip install --quiet pyyaml pytest pytest-asyncio structlog pydantic

      - name: Run all eval suites
        # On PRs we additionally gate against the published baseline so a
        # detection-as-code merge that drops MITRE accuracy by ≥ 1pp blocks
        # CI (exit code 2). On push to main, `--baseline` is omitted so the
        # publish step below establishes a fresh baseline.
        run: |
          if [ -f /tmp/eval-baseline/latest.json ]; then
            python3 scripts/run_evals.py \
              --ci \
              --baseline /tmp/eval-baseline/latest.json \
              --max-regression-pp 1.0 \
              --out eval_report.json
          else
            python3 scripts/run_evals.py --ci --out eval_report.json
          fi

      - name: Post job summary
        # Runs even when the eval step failed so the summary shows which
        # gate regressed.
        if: always()
        run: |
          python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
          import json, pathlib
          report = pathlib.Path("eval_report.json")
          if not report.is_file():
              # The eval runner died before writing its report. Say so in the
              # summary instead of raising FileNotFoundError from this step.
              print("## AiSOC Pillar-1 Eval Harness\n")
              print("`eval_report.json` was not produced, so there is nothing to summarise.\n")
              raise SystemExit(0)
          d = json.loads(report.read_text())
          print("## AiSOC Pillar-1 Eval Harness — 200-incident synthetic dataset\n")
          print("Alert-reduction is a real measurement; the other three suites are substrate self-consistency gates.\n")
          # Two trailing spaces = Markdown hard line break before "Generated".
          print(f"Dataset: {d['dataset']}  ")
          print(f"Generated: {d['generated_at']}\n")
          print("| Suite | Metric | Value | Target | Result |")
          print("|---|---|---:|---:|:---:|")
          for name, suite in d["suites"].items():
              mark = "PASS" if suite["passed"] else "FAIL"
              print(
                  f"| `{name}` | {suite['metric']} | "
                  f"{suite['value']:.3f} | ≥ {suite['target']:.2f} | {mark} |"
              )
          verdict = "ALL GATES PASSED" if d["all_passed"] else "REGRESSION DETECTED"
          print(f"\n{verdict}\n")
          PY

      - name: Upload eval report
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: aisoc-eval-report
          path: eval_report.json

      - name: Publish historical numbers to eval-results branch
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMIT_SHA: ${{ github.sha }}
        run: |
          set -e
          # Snapshot the report before the branch switch clears the workspace
          cp eval_report.json /tmp/eval_report.json

          if git fetch origin eval-results:eval-results 2>/dev/null; then
            git checkout eval-results
          else
            git checkout --orphan eval-results
            # `--orphan` keeps the previous branch's entire tree staged in the
            # index. Drop it, otherwise the root commit of eval-results would
            # record every file from main alongside eval/results.
            git rm -rf --quiet --ignore-unmatch .
          fi

          # Clear any previous workspace state on the orphan branch
          if [ ! -f .gitkeep ]; then
            find . -mindepth 1 -maxdepth 1 \
              ! -name '.git' ! -name 'eval' ! -name '.gitkeep' \
              -exec rm -rf {} + 2>/dev/null || true
          fi

          mkdir -p eval/results
          cp /tmp/eval_report.json "eval/results/${COMMIT_SHA}.json"
          # Maintain a stable `latest.json` for the eval harness page badge
          cp /tmp/eval_report.json eval/results/latest.json

          # Identity policy: every commit on this repo must be authored by
          # Beenu Arora <beenu@cyble.com>. The eval-results branch is no
          # exception — even though it is machine-written by CI, attributing
          # it to a separate "AiSOC CI" identity would re-pollute the
          # contributors graph after the 2026-05-08 history rewrite.
          git config user.email "beenu@cyble.com"
          git config user.name "Beenu Arora"

          git add eval/results
          # Commit only when something is staged, so a genuine commit failure
          # is not mistaken for "nothing to commit".
          if git diff --cached --quiet; then
            echo "no changes"
          else
            git commit -m "eval: ${COMMIT_SHA::7} (auto)"
          fi

          # Publishing stays best-effort (a failed push does not fail the
          # gate), but surface it as an annotation instead of hiding it.
          git push origin eval-results \
            || echo "::warning::push to eval-results failed; eval results for ${COMMIT_SHA::7} were not published"

  # ──────────────────────────────────────────────
  # Pillar-2: Playbook JSON Schema lint
  # Validates all playbook JSON files against schemas/playbook.schema.json
  # ──────────────────────────────────────────────
  p2-playbook-schema-lint:
    name: P2 — Playbook Schema Lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install jsonschema
        run: pip install jsonschema
      - name: Lint playbook files
        run: python3 scripts/lint_playbooks.py

  # ──────────────────────────────────────────────
  # Docker Compose smoke test
  # ──────────────────────────────────────────────
  docker-compose-check:
    name: Docker Compose — Validate
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - name: Validate compose file
        run: docker compose config --quiet