#!/usr/bin/env python3
"""
Fable-mode leak test
====================
Measures whether Opus is converging toward Fable's behavioral signature, using
your own Claude Code transcripts in ~/.claude/projects.

It buckets assistant messages into three groups and compares them:
  opus_pre   claude-opus-4-8  BEFORE the governor deploy date  (baseline / "the disease")
  opus_post  claude-opus-4-8  ON/AFTER the deploy date         (governed / "is it working?")
  fable      claude-fable-5   any date                         (the target signature)

Metrics chosen from the 2026-06-14 log analysis (the ones that actually
distinguished the models in execution work):
  - median words / message      Fable ~18  vs un-governed Opus ~47   (lower = better)
  - tool:text ratio             Fable ~3.9 vs un-governed Opus ~1.4  (higher = better)
  - unsolicited-caveat %        armor-hedging rate                   (lower = better)
  - "I'll / Let me" opener %    self-as-actor framing                (lower = better)

Usage:
  python3 leak_test.py
  python3 leak_test.py --since 2026-06-13 --project myproject
  python3 leak_test.py --cap 20000

The "opus_post" sample is small right after deploy and grows as you work — the
verdict marks it INSUFFICIENT until it has enough prose messages to be meaningful.
"""
import json, os, glob, argparse, statistics

# Lowercase hedge phrases. A prose message is counted as carrying an
# unsolicited caveat when any of these occurs anywhere in its lowercased text.
CAVEAT = [
    "to be fair",
    "that said",
    "it's worth noting",
    "it's worth flagging",
    "i should flag",
    "i should note",
    "one caveat",
    "caveat:",
    "i could be wrong",
    "i might be wrong",
    "grain of salt",
    "for what it's worth",
    "to be clear",
    "honest caveat",
    "with the caveat",
    "i want to be careful",
    "honestly,",
]

# Lowercase prefixes marking self-as-actor openers. Kept as a tuple because it
# is handed directly to str.startswith; the trailing spaces on "i can " and
# "i'd " are deliberate so longer words sharing the prefix do not match.
SELF_OPENERS = (
    "i'll",
    "let me",
    "i will",
    "let's",
    "i'm going to",
    "i can ",
    "i'd ",
    "i am going",
)

# Governor deployed to global CLAUDE.md on this date; default for --since.
DEFAULT_CUTOFF = "2026-06-13"

def classify(model, ts, cutoff):
    """Return the comparison bucket for one assistant message.

    Args:
        model: Model identifier from the transcript; coerced with ``str`` so
            a missing/None value simply fails both prefix checks.
        ts: Timestamp string whose first 10 characters are compared
            lexically against ``cutoff`` (so a ``YYYY-MM-DD...`` layout is
            expected). May be empty or None.
        cutoff: Governor deploy date as ``YYYY-MM-DD``.

    Returns:
        ``"fable"`` for claude-fable-5 models regardless of date,
        ``"opus_post"`` for claude-opus-4-8 on or after ``cutoff``,
        ``"opus_pre"`` for claude-opus-4-8 before ``cutoff`` or with no
        timestamp, and ``None`` for every other model.
    """
    name = str(model)
    if name.startswith("claude-fable-5"):
        return "fable"
    if not name.startswith("claude-opus-4-8"):
        return None
    # An empty/missing timestamp can never qualify as post-deploy.
    governed = bool(ts) and ts[:10] >= cutoff
    if governed:
        return "opus_post"
    return "opus_pre"

def new_acc():
    """Build a fresh, zeroed accumulator for a single bucket.

    Returns:
        A new dict on every call (with its own ``words`` list), holding:
        ``n`` messages seen, ``prose`` messages with non-blank text,
        ``words`` per-message word counts, ``tool`` tool_use block count,
        ``text`` text block count, ``caveat`` and ``selfopen`` hit counts.
    """
    return {
        "n": 0,
        "prose": 0,
        "words": [],
        "tool": 0,
        "text": 0,
        "caveat": 0,
        "selfopen": 0,
    }

def main():
    """Scan Claude Code transcripts and print the leak-test table and verdict.

    Command-line options:
        --since    governor cutoff date (YYYY-MM-DD); splits Opus messages
                   into the opus_pre / opus_post buckets.
        --project  only scan transcripts whose path under
                   ~/.claude/projects contains this substring
                   (case-insensitive).
        --cap      maximum number of messages counted per bucket.

    Every ``*.jsonl`` file under ~/.claude/projects/<project>/ is read line by
    line. Unreadable files, lines that are not valid JSON and records that do
    not have the expected shape are skipped rather than aborting the run.
    Results are printed to stdout; nothing is returned.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--since", default=DEFAULT_CUTOFF, help="governor cutoff date YYYY-MM-DD")
    ap.add_argument("--project", default=None, help="only scan project folders containing this substring")
    ap.add_argument("--cap", type=int, default=15000, help="max messages per bucket (runtime bound)")
    args = ap.parse_args()

    base = os.path.expanduser("~/.claude/projects")
    # With recursive=True, "**" matches zero or more directories, so this one
    # pattern covers transcripts sitting directly in a project folder as well
    # as nested ones.
    files = sorted(set(glob.glob(os.path.join(base, "*", "**", "*.jsonl"), recursive=True)))
    if args.project:
        needle = args.project.lower()
        # Match only the part of the path below `base`, so the home-directory
        # prefix (shared by every file) can never satisfy the filter.
        files = [f for f in files if needle in os.path.relpath(f, base).lower()]

    B = {k: new_acc() for k in ("opus_pre", "opus_post", "fable")}
    for f in files:
        # Stop early once every bucket has reached its cap.
        if all(B[k]["n"] >= args.cap for k in B):
            break
        try:
            # Transcripts are JSON, hence UTF-8; undecodable bytes are dropped.
            fh = open(f, encoding="utf-8", errors="ignore")
        except OSError:
            continue
        with fh:  # guarantees the handle is closed even on an early exit
            for line in fh:
                try:
                    o = json.loads(line)
                except (ValueError, RecursionError):
                    # Blank, truncated or otherwise malformed line.
                    continue
                if not isinstance(o, dict):
                    continue
                msg = o.get("message")
                if not isinstance(msg, dict) or msg.get("role") != "assistant":
                    continue
                ts = o.get("timestamp", "")
                if not isinstance(ts, str):
                    ts = ""  # classify() slices the timestamp as a string
                b = classify(msg.get("model", ""), ts, args.since)
                if not b or B[b]["n"] >= args.cap:
                    continue
                d = B[b]
                d["n"] += 1
                c = msg.get("content")
                parts = []
                if isinstance(c, str):
                    parts.append(c)
                elif isinstance(c, list):
                    for blk in c:
                        if not isinstance(blk, dict):
                            continue
                        t = blk.get("type")
                        if t == "text":
                            d["text"] += 1
                            piece = blk.get("text")
                            if isinstance(piece, str):
                                parts.append(piece)
                        elif t == "tool_use":
                            d["tool"] += 1
                # Join with a separator so the last word of one text block and
                # the first word of the next are not fused into a single word.
                text = "\n".join(parts)
                if text.strip():
                    d["prose"] += 1
                    # Fold typographic apostrophes so "I’ll" / "it’s worth
                    # noting" match the straight-quote phrase lists.
                    low = text.lower().replace("\u2019", "'")
                    d["words"].append(len(text.split()))
                    if any(p in low for p in CAVEAT):
                        d["caveat"] += 1
                    if low.lstrip().startswith(SELF_OPENERS):
                        d["selfopen"] += 1

    def med(w, p=50):
        """Return the p-th percentile of word counts `w` as an int (0 if empty)."""
        if not w:
            return 0
        if p == 50:
            return int(statistics.median(w))
        if len(w) == 1:
            return w[0]
        # "inclusive" interpolates strictly between min and max. The default
        # "exclusive" method extrapolates on small samples and can report a
        # percentile below the minimum (even a negative word count).
        return int(statistics.quantiles(w, n=100, method="inclusive")[p - 1])

    def metrics(d):
        """Reduce one bucket accumulator to the figures shown in the table."""
        prose = d["prose"] or 1  # avoid dividing by zero for an empty bucket
        return dict(
            msgs=d["n"], prose=d["prose"],
            p25=med(d["words"], 25), p50=med(d["words"], 50), p75=med(d["words"], 75),
            ttr=d["tool"] / (d["text"] or 1),
            cav=100 * d["caveat"] / prose,
            so=100 * d["selfopen"] / prose,
        )

    M = {k: metrics(v) for k, v in B.items()}
    pre, post, fab = M["opus_pre"], M["opus_post"], M["fable"]

    proj_note = f", project~={args.project}" if args.project else ""
    print(f"\n  Fable-mode leak test   (cutoff {args.since}{proj_note})")
    print("  " + "-" * 74)
    print(f"  {'metric':24}{'opus_pre':>12}{'opus_post':>12}{'FABLE(target)':>16}")
    print("  " + "-" * 74)
    print(f"  {'assistant msgs':24}{pre['msgs']:>12}{post['msgs']:>12}{fab['msgs']:>16}")
    print(f"  {'  w/ prose':24}{pre['prose']:>12}{post['prose']:>12}{fab['prose']:>16}")
    print(f"  {'median words/msg':24}{pre['p50']:>12}{post['p50']:>12}{fab['p50']:>16}")
    print(f"  {'  (p25 / p75)':24}{str(pre['p25'])+'/'+str(pre['p75']):>12}"
          f"{str(post['p25'])+'/'+str(post['p75']):>12}{str(fab['p25'])+'/'+str(fab['p75']):>16}")
    print(f"  {'tool:text ratio':24}{pre['ttr']:>12.2f}{post['ttr']:>12.2f}{fab['ttr']:>16.2f}")
    print(f"  {'unsolicited-caveat %':24}{pre['cav']:>12.1f}{post['cav']:>12.1f}{fab['cav']:>16.1f}")
    so_label = "I'll/Let me opener %"
    print(f"  {so_label:24}{pre['so']:>12.1f}{post['so']:>12.1f}{fab['so']:>16.1f}")
    print("  " + "-" * 74)

    # verdict
    INSUFF = post["prose"] < 30
    # Without any baseline or target prose the metrics for that bucket are all
    # zero placeholders, so a convergence verdict against them is meaningless.
    NO_REF = pre["prose"] == 0 or fab["prose"] == 0

    def arrow(pre_v, post_v, fab_v, lower_is_better):
        """Format one verdict line: baseline → governed, target, and a mark."""
        if INSUFF:
            return "—  (insufficient post-governor data; accumulates as you work)"
        if NO_REF:
            return "—  (no opus_pre baseline or Fable target prose to compare against)"
        toward = (post_v < pre_v) if lower_is_better else (post_v > pre_v)
        # closer to fable than baseline was?
        closed = abs(post_v - fab_v) < abs(pre_v - fab_v)
        mark = "✓ converging" if (toward and closed) else ("✗ not converging" if not toward else "~ moved, check")
        return f"{pre_v:.1f} → {post_v:.1f}  (target {fab_v:.1f})   {mark}"

    print("\n  VERDICT (is governed Opus moving toward Fable?)")
    print(f"    median words   {arrow(pre['p50'], post['p50'], fab['p50'], True)}")
    print(f"    tool:text      {arrow(pre['ttr'], post['ttr'], fab['ttr'], False)}")
    print(f"    caveat %       {arrow(pre['cav'], post['cav'], fab['cav'], True)}")
    print(f"    self-opener %  {arrow(pre['so'], post['so'], fab['so'], True)}")
    if INSUFF:
        print(f"\n  NOTE: only {post['prose']} governed prose msgs so far. Re-run after more"
              f" Opus work for a real verdict.")
    print()

# Run the scan only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()