#!/usr/bin/env python3
"""
Fable-mode leak test
====================
Measures whether Opus is converging toward Fable's behavioral signature, using
your own Claude Code transcripts in ~/.claude/projects.

It buckets assistant messages into three groups and compares them:

  opus_pre   claude-opus-4-8 BEFORE the governor deploy date  (baseline / "the disease")
  opus_post  claude-opus-4-8 ON/AFTER the deploy date         (governed / "is it working?")
  fable      claude-fable-5  any date                         (the target signature)

Metrics chosen from the 2026-06-14 log analysis (the ones that actually
distinguished the models in execution work):

  - median words / message      Fable ~18  vs un-governed Opus ~47   (lower = better)
  - tool:text ratio             Fable ~3.9 vs un-governed Opus ~1.4  (higher = better)
  - unsolicited-caveat %        armor-hedging rate                   (lower = better)
  - "I'll / Let me" opener %    self-as-actor framing                (lower = better)

Usage:
  python3 leak_test.py
  python3 leak_test.py --since 2026-06-13 --project myproject
  python3 leak_test.py --cap 20000

The "opus_post" sample is small right after deploy and grows as you work — the
verdict marks it INSUFFICIENT until it has enough prose messages to be meaningful.
"""
import argparse
import glob
import json
import os
import statistics

# Phrases counted as an "unsolicited caveat" (matched as lowercase substrings).
CAVEAT = ["to be fair", "that said", "it's worth noting", "it's worth flagging",
          "i should flag", "i should note", "one caveat", "caveat:",
          "i could be wrong", "i might be wrong", "grain of salt",
          "for what it's worth", "to be clear", "honest caveat",
          "with the caveat", "i want to be careful", "honestly,"]

# Message openings counted as "self-as-actor" framing (lowercase prefixes).
SELF_OPENERS = ("i'll", "let me", "i will", "let's", "i'm going to",
                "i can ", "i'd ", "i am going")

DEFAULT_CUTOFF = "2026-06-13"   # governor deployed to global CLAUDE.md on this date

# Below this many governed prose messages the verdict is reported as insufficient.
MIN_POST_PROSE = 30


def classify(model, ts, cutoff):
    """Return the bucket name for a message, or None if the model is not tracked.

    model   -- model identifier from the transcript (any type; stringified).
    ts      -- timestamp string; only its first 10 characters are compared, so
               it is expected to start with YYYY-MM-DD. Missing or non-string
               timestamps count as "before the cutoff".
    cutoff  -- governor cutoff date as a YYYY-MM-DD string.
    """
    m = str(model)
    if m.startswith("claude-fable-5"):
        return "fable"
    if m.startswith("claude-opus-4-8"):
        # A non-string timestamp cannot be sliced/compared; treat it as missing.
        day = ts[:10] if isinstance(ts, str) else ""
        return "opus_post" if (day and day >= cutoff) else "opus_pre"
    return None


def new_acc():
    """Return a fresh, zeroed accumulator for one bucket."""
    return dict(n=0, prose=0, words=[], tool=0, text=0, caveat=0, selfopen=0)


def parse_record(line):
    """Parse one JSONL line; return the record dict, or None if unusable.

    Lines that are not valid JSON, or that decode to something other than a
    JSON object (e.g. a bare number or list), are skipped rather than crashing
    the scan.
    """
    try:
        rec = json.loads(line)
    except (ValueError, RecursionError):   # JSONDecodeError is a ValueError
        return None
    return rec if isinstance(rec, dict) else None


def tally_message(acc, content):
    """Fold one assistant message's content into accumulator *acc*; return acc.

    *content* may be a plain string or a list of content blocks (dicts with a
    "type" key). "text" and "tool_use" blocks are counted; other entries are
    ignored. NOTE(review): plain-string content adds prose but does not bump
    the "text" block count -- kept as in the original; confirm that is intended.
    """
    acc["n"] += 1
    parts = []
    if isinstance(content, str):
        parts.append(content)
    elif isinstance(content, list):
        for blk in content:
            if not isinstance(blk, dict):
                continue
            kind = blk.get("type")
            if kind == "text":
                acc["text"] += 1
                piece = blk.get("text", "")
                if isinstance(piece, str):   # tolerate null / malformed text
                    parts.append(piece)
            elif kind == "tool_use":
                acc["tool"] += 1
    text = "".join(parts)
    if text.strip():
        acc["prose"] += 1
        # Normalise the typographic apostrophe so "I’ll" / "it’s" match the
        # straight-apostrophe phrase lists above.
        low = text.lower().replace("\u2019", "'")
        acc["words"].append(len(text.split()))
        if any(p in low for p in CAVEAT):
            acc["caveat"] += 1
        if low.lstrip().startswith(SELF_OPENERS):
            acc["selfopen"] += 1
    return acc


def scan_file(path, buckets, cutoff, cap):
    """Read one transcript file and tally its assistant messages into *buckets*.

    Unreadable files are skipped. Each bucket stops accepting messages once it
    holds *cap* of them.
    """
    try:
        # Assumes transcripts are UTF-8 JSON lines; undecodable bytes are dropped.
        fh = open(path, encoding="utf-8", errors="ignore")
    except OSError:
        return
    with fh:
        for line in fh:
            rec = parse_record(line)
            if rec is None:
                continue
            msg = rec.get("message")
            if not isinstance(msg, dict) or msg.get("role") != "assistant":
                continue
            bucket = classify(msg.get("model", ""), rec.get("timestamp", ""), cutoff)
            if not bucket or buckets[bucket]["n"] >= cap:
                continue
            tally_message(buckets[bucket], msg.get("content"))


def percentile(words, p=50):
    """Return the integer-truncated p-th percentile of *words* (0 if empty)."""
    if not words:
        return 0
    if p == 50:
        return int(statistics.median(words))
    # statistics.quantiles needs at least two data points.
    if len(words) > 1:
        return int(statistics.quantiles(words, n=100)[p - 1])
    return words[0]


def metrics(acc):
    """Derive the reported metrics from one bucket accumulator."""
    prose = acc["prose"] or 1   # avoid division by zero on empty buckets
    return dict(
        msgs=acc["n"],
        prose=acc["prose"],
        p25=percentile(acc["words"], 25),
        p50=percentile(acc["words"], 50),
        p75=percentile(acc["words"], 75),
        ttr=acc["tool"] / (acc["text"] or 1),
        cav=100 * acc["caveat"] / prose,
        so=100 * acc["selfopen"] / prose,
    )


def verdict(pre_v, post_v, fab_v, lower_is_better, insufficient=False):
    """Format one verdict line comparing baseline, governed and target values.

    "converging" requires the governed value to have moved in the better
    direction AND to be closer to the target than the baseline was; moving in
    the right direction but ending farther away (overshoot) is "moved, check".
    """
    if insufficient:
        return "— (insufficient post-governor data; accumulates as you work)"
    toward = (post_v < pre_v) if lower_is_better else (post_v > pre_v)
    # closer to fable than baseline was?
    closed = abs(post_v - fab_v) < abs(pre_v - fab_v)
    if toward and closed:
        mark = "✓ converging"
    elif not toward:
        mark = "✗ not converging"
    else:
        mark = "~ moved, check"
    return f"{pre_v:.1f} → {post_v:.1f} (target {fab_v:.1f}) {mark}"


def main(argv=None):
    """Scan the transcripts, print the comparison table and the verdict.

    argv -- optional argument list (defaults to sys.argv[1:] via argparse).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--since", default=DEFAULT_CUTOFF, help="governor cutoff date YYYY-MM-DD")
    ap.add_argument("--project", default=None,
                    help="only scan project folders containing this substring")
    ap.add_argument("--cap", type=int, default=15000,
                    help="max messages per bucket (runtime bound)")
    args = ap.parse_args(argv)

    base = os.path.expanduser("~/.claude/projects")
    # With recursive=True, "**" also matches zero directories, so this single
    # pattern covers files directly inside a project folder as well.
    files = sorted(set(glob.glob(os.path.join(base, "*", "**", "*.jsonl"), recursive=True)))
    if args.project:
        needle = args.project.lower()
        # Match below the projects root only, so the home directory / ".claude"
        # part of the path cannot cause every file to match.
        files = [f for f in files if needle in os.path.relpath(f, base).lower()]

    B = {k: new_acc() for k in ("opus_pre", "opus_post", "fable")}
    for f in files:
        if all(B[k]["n"] >= args.cap for k in B):
            break
        scan_file(f, B, args.since, args.cap)

    M = {k: metrics(v) for k, v in B.items()}
    pre, post, fab = M["opus_pre"], M["opus_post"], M["fable"]

    proj_note = f", project~={args.project}" if args.project else ""
    print(f"\n Fable-mode leak test (cutoff {args.since}{proj_note})")
    print(" " + "-" * 74)
    print(f" {'metric':24}{'opus_pre':>12}{'opus_post':>12}{'FABLE(target)':>16}")
    print(" " + "-" * 74)
    print(f" {'assistant msgs':24}{pre['msgs']:>12}{post['msgs']:>12}{fab['msgs']:>16}")
    print(f" {' w/ prose':24}{pre['prose']:>12}{post['prose']:>12}{fab['prose']:>16}")
    print(f" {'median words/msg':24}{pre['p50']:>12}{post['p50']:>12}{fab['p50']:>16}")
    print(f" {' (p25 / p75)':24}{str(pre['p25'])+'/'+str(pre['p75']):>12}"
          f"{str(post['p25'])+'/'+str(post['p75']):>12}{str(fab['p25'])+'/'+str(fab['p75']):>16}")
    print(f" {'tool:text ratio':24}{pre['ttr']:>12.2f}{post['ttr']:>12.2f}{fab['ttr']:>16.2f}")
    print(f" {'unsolicited-caveat %':24}{pre['cav']:>12.1f}{post['cav']:>12.1f}{fab['cav']:>16.1f}")
    so_label = "I'll/Let me opener %"
    print(f" {so_label:24}{pre['so']:>12.1f}{post['so']:>12.1f}{fab['so']:>16.1f}")
    print(" " + "-" * 74)

    # verdict
    INSUFF = post["prose"] < MIN_POST_PROSE

    print("\n VERDICT (is governed Opus moving toward Fable?)")
    print(f" median words {verdict(pre['p50'], post['p50'], fab['p50'], True, INSUFF)}")
    print(f" tool:text {verdict(pre['ttr'], post['ttr'], fab['ttr'], False, INSUFF)}")
    print(f" caveat % {verdict(pre['cav'], post['cav'], fab['cav'], True, INSUFF)}")
    print(f" self-opener % {verdict(pre['so'], post['so'], fab['so'], True, INSUFF)}")
    if INSUFF:
        print(f"\n NOTE: only {post['prose']} governed prose msgs so far. Re-run after more"
              f" Opus work for a real verdict.")
    print()


if __name__ == "__main__":
    main()