""" Pillar-1 Evaluation: AI-vs-AI Adversary — Graceful-Degradation Gate (w2-aivai) ============================================================================== Sixth CI-gated eval suite. Loads the deterministic adversary-mutated dataset produced by `scripts/generate_adversary_incidents.py` and grades the same keyword tactic extractor that powers `test_mitre_accuracy.py`. The dataset rewrites every defender-known keyword into evasive synonyms, character obfuscation, and fragmentation across three intensity buckets: heavy ≈ 45% every keyword swapped medium ≈ 35% one expected tactic preserved cleanly light ≈ 20% light leetspeak only (control bucket) Two regression floors are enforced: * **Overall catch rate ≥ 0.40** — under heavy adversarial mutation the defender is expected to drop ~50pp from its 0.95 baseline. The floor of 0.40 keeps "graceful degradation" honest: the substrate must still catch a meaningful fraction, but not anywhere near the unmutated baseline. * **Light-bucket catch rate ≥ 0.85** — light-tier obfuscation is just leetspeak; if the defender silently fails this bucket too, then a heavy-tier failure is hiding a deeper regression. Both floors are intentionally below the 0.80 floor of the unmutated suite. The point of this suite is NOT to prove the defender is great under adversarial pressure — it's to prove the defender doesn't fall off a cliff and to give us a measurable graceful-degradation curve we can watch over time. Run: pytest services/agents/tests/test_adversary_eval.py -v # or via the public eval harness: python scripts/run_evals.py --out eval_report.json """ from __future__ import annotations import json import unittest from pathlib import Path from typing import Any from .test_mitre_accuracy import extract_tactics_from_text # type: ignore _TESTS_DIR = Path(__file__).parent _ADVERSARY_PATH = _TESTS_DIR / "eval_data" / "adversary_incidents.json" _BASE_PATH = _TESTS_DIR / "eval_data" / "synthetic_incidents.json" # Regression floors. See module docstring for rationale. _OVERALL_FLOOR = 0.40 _LIGHT_BUCKET_FLOOR = 0.85 # Heavy-tier upper bound: if the heavy bucket starts catching too much, # either the mutation grammar has drifted off the keyword catalogue or the # defender has silently widened its substring matches. Either way it # means the "adversarial" dataset isn't actually adversarial anymore. _HEAVY_BUCKET_CEILING = 0.50 class AdversaryEvalResult: def __init__(self) -> None: self.total = 0 self.correct = 0 self.bucket_counts: dict[str, int] = {"heavy": 0, "medium": 0, "light": 0} self.bucket_correct: dict[str, int] = {"heavy": 0, "medium": 0, "light": 0} self.lost_all_tactics = 0 self.per_tactic_lost: dict[str, int] = {} self.details: list[dict[str, Any]] = [] @property def accuracy(self) -> float: return self.correct / self.total if self.total else 0.0 def bucket_accuracy(self, bucket: str) -> float: n = self.bucket_counts.get(bucket, 0) return (self.bucket_correct.get(bucket, 0) / n) if n else 0.0 def to_summary(self) -> dict[str, Any]: return { "incidents": self.total, "correct": self.correct, "accuracy": round(self.accuracy, 4), "lost_all_tactics": self.lost_all_tactics, "buckets": { b: { "incidents": self.bucket_counts[b], "correct": self.bucket_correct[b], "accuracy": round(self.bucket_accuracy(b), 4), } for b in ("heavy", "medium", "light") }, "per_tactic_lost": dict(sorted(self.per_tactic_lost.items())), } def _load_adversary_dataset() -> list[dict[str, Any]]: if not _ADVERSARY_PATH.exists(): raise FileNotFoundError( f"Adversary dataset missing at {_ADVERSARY_PATH}. Generate with: python3 scripts/generate_adversary_incidents.py" ) return json.loads(_ADVERSARY_PATH.read_text()) def evaluate_adversary_accuracy() -> AdversaryEvalResult: """Run the keyword tactic extractor against the mutated dataset. Same scoring rule as `test_mitre_accuracy.py`: a case is correct if the predicted tactic set overlaps the expected set by at least one tactic. The point is graceful-degradation, not zero-error detection. """ incidents = _load_adversary_dataset() result = AdversaryEvalResult() for inc in incidents: result.total += 1 bucket = inc.get("adversary_intensity", "heavy") result.bucket_counts[bucket] = result.bucket_counts.get(bucket, 0) + 1 expected = set(inc.get("expected_tactics", [])) text = f"{inc['title']}\n{inc['description']}" predicted = extract_tactics_from_text(text) overlap = predicted & expected correct = bool(overlap) if correct: result.correct += 1 result.bucket_correct[bucket] = result.bucket_correct.get(bucket, 0) + 1 else: result.lost_all_tactics += 1 for t in expected - predicted: result.per_tactic_lost[t] = result.per_tactic_lost.get(t, 0) + 1 result.details.append( { "incident_id": inc.get("id"), "template_id": inc.get("template_id"), "adversary_intensity": bucket, "expected": sorted(expected), "predicted": sorted(predicted), "overlap": sorted(overlap), "correct": correct, } ) return result # --------------------------------------------------------------------------- # pytest tests # --------------------------------------------------------------------------- class TestAdversaryEval(unittest.TestCase): """Sixth CI suite — graceful-degradation under adversarial mutation.""" def test_dataset_present(self) -> None: self.assertTrue( _ADVERSARY_PATH.exists(), f"Adversary dataset missing at {_ADVERSARY_PATH}. Run scripts/generate_adversary_incidents.py to (re)generate it.", ) # The mutated set must mirror the base set 1:1 so per-template # diffs are meaningful. base = json.loads(_BASE_PATH.read_text()) mutated = json.loads(_ADVERSARY_PATH.read_text()) self.assertEqual( len(base), len(mutated), f"Adversary dataset size {len(mutated)} != base dataset size {len(base)}", ) def test_dataset_is_actually_mutated(self) -> None: """Make sure the generator actually changed the text — not a no-op. Some templates legitimately contain no defender keyword the grammar knows about (and the light bucket only applies leetspeak), so a meaningful fraction of the corpus will pass through unchanged. The floor here just guards against the grammar collapsing to a no-op. """ mutated = _load_adversary_dataset() unchanged = sum( 1 for inc in mutated if inc["title"] == inc.get("original_title") and inc["description"] == inc.get("original_description") ) self.assertLess( unchanged, (len(mutated) * 35) // 100, f"{unchanged}/{len(mutated)} incidents unchanged — mutation grammar may have regressed.", ) def test_overall_graceful_degradation(self) -> None: result = evaluate_adversary_accuracy() print( f"\n[eval] Adversary catch rate: {result.correct}/{result.total} = " f"{result.accuracy * 100:.1f}% " f"(heavy={result.bucket_accuracy('heavy') * 100:.1f}%, " f"medium={result.bucket_accuracy('medium') * 100:.1f}%, " f"light={result.bucket_accuracy('light') * 100:.1f}%)" ) self.assertGreaterEqual( result.accuracy, _OVERALL_FLOOR, f"Adversary catch rate {result.accuracy:.1%} below " f"graceful-degradation floor of {_OVERALL_FLOOR:.0%}.\n" + json.dumps(result.to_summary(), indent=2)[:4000], ) def test_light_bucket_still_caught(self) -> None: """Light-tier obfuscation is leetspeak only — defender should pass.""" result = evaluate_adversary_accuracy() light_acc = result.bucket_accuracy("light") self.assertGreaterEqual( light_acc, _LIGHT_BUCKET_FLOOR, f"Light-bucket adversary accuracy {light_acc:.1%} below " f"control floor of {_LIGHT_BUCKET_FLOOR:.0%}. " "Defender keyword extractor may have regressed.", ) def test_heavy_bucket_actually_evades(self) -> None: """Heavy-tier mutation must actually hurt the defender. If heavy catches too much, the dataset isn't adversarial anymore — either the grammar has regressed or the defender has silently widened its substring matches. """ result = evaluate_adversary_accuracy() heavy_acc = result.bucket_accuracy("heavy") self.assertLessEqual( heavy_acc, _HEAVY_BUCKET_CEILING, f"Heavy-bucket adversary accuracy {heavy_acc:.1%} above " f"adversariality ceiling of {_HEAVY_BUCKET_CEILING:.0%}. " "Mutation grammar isn't actually evading detection — " "synonyms may be leaking defender keywords.", ) def test_bucket_distribution(self) -> None: """Heavy bucket must be substantial — otherwise we're not testing it.""" result = evaluate_adversary_accuracy() self.assertGreater( result.bucket_counts["heavy"], result.total // 4, f"Heavy bucket only {result.bucket_counts['heavy']}/{result.total} — mutation distribution may have drifted.", ) self.assertGreater( result.bucket_counts["light"], 0, "Light bucket is empty — no control sample.", ) if __name__ == "__main__": unittest.main()