#!/usr/bin/env python3
"""cc-audit — lint any CLAUDE.md / AGENTS.md against the 12-rule baseline.

Usage:
    python cc_audit.py                    # scans ./CLAUDE.md, ./AGENTS.md,
                                          # ./.cursorrules and
                                          # ./.github/copilot-instructions.md
    python cc_audit.py path/to/file.md    # explicit path
    python cc_audit.py --json             # machine-readable output

Checks:
    1. File exists and is non-empty
    2. Size below the 200-line compliance cliff
    3. Each of the 12 baseline rules has some signal (keyword match)
    4. No forbidden anti-patterns (e.g. leaked paypal links, huge token dumps)
    5. Project-specifics section is present
    6. YAML frontmatter is valid if present (not checked yet)

Exit code 0 if every audited file passes, 1 if the worst result is a warning,
2 if any file is broken or missing (or no file was found to audit).
"""
from __future__ import annotations

import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable

# --- 12-rule keyword signals ---------------------------------------------
# We don't require exact wording; we look for any of the signal words for
# the rule. A rule is "covered" if at least one signal appears somewhere
# in the file. These are intentionally permissive — missing a rule entirely
# is the problem, not paraphrasing it.
#
# Each entry is (rule label, signal phrases). audit() lowercases both the
# file text and every signal and tests plain substring containment, so a
# signal also matches when it occurs inside a longer word.
#
# NOTE: some signals overlap. "adjacent code" is listed for both rule 3 and
# rule 8, so a single occurrence covers both rules. "budget" matches every
# text that "token budget" matches, and "conflict" every text that
# "surface conflict" matches, so the longer phrases never decide the outcome.
RULE_SIGNALS: list[tuple[str, list[str]]] = [
    ("1: think before coding",
     ["assumption", "think before", "surface tradeoffs", "push back"]),
    ("2: simplicity first",
     ["simplicity", "minimum code", "speculative", "simplest"]),
    ("3: surgical changes",
     ["surgical", "touch only", "adjacent code", "match existing style"]),
    ("4: goal-driven execution",
     ["goal-driven", "success criteria", "define success", "until verified"]),
    ("5: don't make the model do non-language work",
     ["non-language", "deterministic code", "deterministic logic",
      "retry policy", "routing is code"]),
    ("6: hard token budget",
     ["token budget", "budget", "spiral", "ceiling", "re-chew"]),
    ("7: surface conflicts",
     ["surface conflict", "two pattern", "pick one", "conflict"]),
    ("8: read before you write",
     ["read before", "understand adjacent", "adjacent code"]),
    ("9: tests gated by correctness",
     ["tests are gated", "behavior, not shape", "assertions", "not just"]),
    ("10: checkpoints for long operations",
     ["checkpoint", "long-running", "commit between", "multi-step"]),
    ("11: convention beats novelty",
     ["convention", "established pattern", "novelty"]),
    ("12: fail visibly",
     ["fail visibly", "partial failure", "silent", "skipped rows",
      "truncated output"]),
]

# (regex, message) pairs. audit() searches the raw file text with each regex
# and records the message on a match; any recorded message makes
# Result.status report "broken".
# NOTE: audit() passes re.IGNORECASE, so the case-specific character classes
# below (e.g. the upper-case-only AKIA pattern) also match lower-case text.
ANTI_PATTERNS: list[tuple[str, str]] = [
    (r"paypal\.me/[\w-]+", "leaked paypal link — remove before committing"),
    (r"ghp_[A-Za-z0-9]{10,}", "leaked GitHub PAT token"),
    (r"sk-[A-Za-z0-9]{20,}", "leaked API key (sk-...)"),
    (r"\bAKIA[0-9A-Z]{16}\b", "leaked AWS access key"),
    (r"\bpassword\s*[:=]\s*['\"]", "literal password in clear text"),
]

# Line-count thresholds used by audit(). Both comparisons are strict (">"):
# exceeding COMPLIANCE_CLIFF sets Result.size_warning (which lowers the score
# and turns the status into "warn"); exceeding only IDEAL_MAX adds a note.
COMPLIANCE_CLIFF = 200  # lines — past this, agent compliance drops sharply
IDEAL_MAX = 150  # lines — above this audit() adds a "consider trimming" note


# --- core logic -----------------------------------------------------------
@dataclass
class Result:
    """Findings collected for one audited file.

    ``score`` and ``status`` are derived on demand from the stored fields,
    so they always reflect the current contents of the lists below.
    """

    # Path as given by the caller, stored as a string.
    path: str
    # Whether the file was found on disk.
    exists: bool
    # Number of lines counted in the file.
    lines: int = 0
    # Labels of the baseline rules with / without a keyword signal.
    rules_hit: list[str] = field(default_factory=list)
    rules_missing: list[str] = field(default_factory=list)
    # Messages of the anti-patterns that matched.
    anti_patterns: list[str] = field(default_factory=list)
    # True when a "project specifics" section was detected.
    has_project_specifics: bool = False
    # Set only when the file is longer than the compliance cliff.
    size_warning: str | None = None
    # Free-form remarks shown to the user.
    notes: list[str] = field(default_factory=list)

    @property
    def score(self) -> int:
        """Return the compliance score, clamped to 0–100.

        Up to 80 points come from rule coverage, 10 from having a project
        specifics section and 10 from having no anti-patterns; a size
        warning costs 10. A file that does not exist scores 0.
        """
        if not self.exists:
            return 0

        points = int(len(self.rules_hit) / len(RULE_SIGNALS) * 80)
        if self.has_project_specifics:
            points += 10
        if not self.anti_patterns:
            points += 10
        if self.size_warning:
            points -= 10
        return min(100, max(0, points))

    @property
    def status(self) -> str:
        """Return "missing", "broken", "warn" or "pass", in that precedence.

        Any anti-pattern makes the file "broken"; more than four missing
        rules or a size warning makes it "warn".
        """
        if not self.exists:
            return "missing"
        if self.anti_patterns:
            return "broken"
        needs_attention = bool(self.size_warning) or len(self.rules_missing) > 4
        return "warn" if needs_attention else "pass"


def audit(path: Path) -> Result:
    """Audit one instruction file and return the findings.

    Args:
        path: Location of the file to check.

    Returns:
        A ``Result``. When ``path`` is not a readable regular file (it does
        not exist, or it is a directory or similar), ``exists`` is False and
        a note explains why; no further checks are run in that case.
    """
    # is_file() rather than exists(): a directory "exists" but read_text()
    # on it would raise instead of producing a result.
    result = Result(path=str(path), exists=path.is_file())
    if not result.exists:
        if path.exists():
            result.notes.append(f"not a regular file: {path}")
        else:
            result.notes.append(f"file not found: {path}")
        return result

    text = path.read_text(encoding="utf-8", errors="replace")

    # A trailing newline terminates the last line; it does not start a new
    # one. Counting "\n" + 1 unconditionally would report 201 for a 200-line
    # file and trigger the size warning one line too early.
    unterminated_last_line = bool(text) and not text.endswith("\n")
    result.lines = text.count("\n") + (1 if unterminated_last_line else 0)

    if not text.strip():
        result.notes.append("file is empty")

    # Rule coverage: case-insensitive substring match of any signal phrase.
    low = text.lower()
    for name, signals in RULE_SIGNALS:
        if any(signal.lower() in low for signal in signals):
            result.rules_hit.append(name)
        else:
            result.rules_missing.append(name)

    # Anti-patterns are searched in the original (not lowercased) text.
    for pattern, message in ANTI_PATTERNS:
        if re.search(pattern, text, flags=re.IGNORECASE):
            result.anti_patterns.append(message)

    if re.search(r"project\s*specific", low):
        result.has_project_specifics = True
    else:
        result.notes.append(
            "no 'project specifics' section — add repo-specific rules"
        )

    # Past the cliff is a warning (affects score and status); merely above
    # the ideal size is only a note.
    if result.lines > COMPLIANCE_CLIFF:
        result.size_warning = (
            f"file is {result.lines} lines — past the ~{COMPLIANCE_CLIFF}-line "
            "compliance cliff; compliance drops sharply beyond this"
        )
    elif result.lines > IDEAL_MAX:
        result.notes.append(
            f"file is {result.lines} lines — above the ideal {IDEAL_MAX}; "
            "consider trimming"
        )

    return result


def find_default_files(cwd: Path) -> list[Path]:
    """Return the well-known instruction files that exist under ``cwd``.

    The result keeps the fixed probing order and contains ``cwd``-joined
    paths; it is empty when none of the candidates exists.
    """
    found: list[Path] = []
    for name in ("CLAUDE.md", "AGENTS.md", ".cursorrules",
                 ".github/copilot-instructions.md"):
        candidate = cwd / name
        if candidate.exists():
            found.append(candidate)
    return found


def render_text(results: Iterable[Result]) -> str:
    """Format audit results as a human-readable, newline-joined report.

    Every result gets a ``=== path ===`` header followed by a blank line at
    the end of its section. Missing files only get a "not found" line;
    existing files get the status line, rule coverage, and whichever of the
    missing-rule, anti-pattern, size-warning and note sections apply.
    """
    out: list[str] = []
    for res in results:
        out.append(f"=== {res.path} ===")
        if res.exists:
            out.append(
                f"  status: {res.status.upper()}   score: {res.score}/100   "
                f"lines: {res.lines}"
            )
            out.append(
                f"  rules covered: {len(res.rules_hit)}/{len(RULE_SIGNALS)}"
            )
            if res.rules_missing:
                out.append("  missing rules:")
                out.extend(f"    - {rule}" for rule in res.rules_missing)
            if res.anti_patterns:
                out.append("  anti-patterns:")
                out.extend(f"    ⨯ {msg}" for msg in res.anti_patterns)
            if res.size_warning:
                out.append(f"  ⚠ {res.size_warning}")
            out.extend(f"  · {note}" for note in res.notes)
        else:
            out.append("  ⨯ file not found")
        out.append("")
    return "\n".join(out)


def render_json(results: Iterable[Result]) -> str:
    """Serialize audit results as a JSON array, indented by two spaces.

    Each result becomes one object whose keys appear in the fixed order
    listed below; ``status`` and ``score`` are the computed properties.
    """
    exported = (
        "path",
        "exists",
        "status",
        "score",
        "lines",
        "rules_hit",
        "rules_missing",
        "anti_patterns",
        "has_project_specifics",
        "size_warning",
        "notes",
    )
    records = [
        {key: getattr(item, key) for key in exported} for item in results
    ]
    return json.dumps(records, indent=2)


def main(argv: list[str] | None = None) -> int:
    """Run the audit from the command line and return the exit code.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 if every audited file passes, 1 if the worst result is a warning,
        2 if any file is broken or missing, or if no path was given and no
        default file exists in the current directory.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # no description instead of crashing on None.splitlines().
    doc_lines = (__doc__ or "").splitlines()
    parser = argparse.ArgumentParser(
        description=doc_lines[0] if doc_lines else None
    )
    parser.add_argument("paths", nargs="*", type=Path,
                        help="CLAUDE.md / AGENTS.md files to audit (default: "
                             "auto-detect from cwd)")
    parser.add_argument("--json", action="store_true",
                        help="emit machine-readable JSON")
    args = parser.parse_args(argv)

    paths = args.paths or find_default_files(Path.cwd())
    if not paths:
        print("no CLAUDE.md / AGENTS.md found in cwd", file=sys.stderr)
        return 2

    results = [audit(path) for path in paths]

    print(render_json(results) if args.json else render_text(results))

    # exit codes: 0 pass, 1 warn, 2 broken/missing — report the worst one.
    severity = {"missing": 2, "broken": 2, "warn": 1}
    return max((severity.get(r.status, 0) for r in results), default=0)


# Script entry point: run main() and pass its return value to sys.exit() so
# it becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())