{ "schema_version": 1, "last_updated": "2026-05-11T01:02:45Z", "count": 11, "entries": [ { "slug": "9f2e9c5b8e", "title": "agent-probe-guard v0.1 — L43 pre_tool detection probe (epiphenomenal-softmax under steering)", "author": "caiovicentino", "type": "probe-result", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Detection-tier probe for tool-call success in SWE-bench traces. AUROC 0.83 at N=42 with random-feature baseline gap +0.27. Causality protocol verdict is epiphenomenal-softmax: probe DETECTS but cannot LEVER (paper-6 Phase 7 finding).", "path": "atlas/2026/9f2e9c5b8e.json", "hf_repo_id": "caiovicentino1/agent-probe-guard-qwen36-27b", "hf_url": "https://huggingface.co/datasets/caiovicentino1/agent-probe-guard-qwen36-27b", "doi": null, "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes", "manifest_sha256": "9f2e9c5b8e4fbb7c7eb4c9290d927e76341006c9d63277da05f4d21a0ab26c9b", "created_at": "2026-05-10T17:00:00Z" }, { "slug": "8d5df2d5d5", "title": "FabricationGuard v2 — L31 cross-task hallucination probe (Qwen3.6-27B)", "author": "caiovicentino", "type": "probe-result", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Linear probe on layer 31 residual stream detects confident hallucinations across tasks. HaluEval within 0.90, SimpleQA cross-task 0.88. −88% confident-wrong reduction in SimpleQA. ~1ms sklearn p95 inference.", "path": "atlas/2026/8d5df2d5d5.json", "hf_repo_id": "openinterp/fabricationguard-qwen36-27b-l31-v2", "hf_url": "https://huggingface.co/openinterp/fabricationguard-qwen36-27b-l31-v2", "doi": null, "paper_url": "https://openinterp.org/products/fabricationguard", "manifest_sha256": "8d5df2d5d52521632e01d2e2617c6d6b328b85c756b68e4d2c70a19ba31be9fc", "created_at": "2026-05-11T01:02:45Z" },
{ "slug": "49eba51edb", "title": "ReasonGuard v0.2 — L55 mid_think CoT faithfulness probe (Qwen3.6-27B)", "author": "caiovicentino", "type": "probe-result", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Position-of-faithfulness probe at L55 mid_think token. AUROC 0.888 within GSM8K, 0.605 cross StrategyQA. Honest narrow-scope finding — domain-bound, not universal.", "path": "atlas/2026/49eba51edb.json", "hf_repo_id": "openinterp/reasonguard-qwen36-27b-l55-mid_think", "hf_url": "https://huggingface.co/openinterp/reasonguard-qwen36-27b-l55-mid_think", "doi": null, "paper_url": null, "manifest_sha256": "49eba51edb65b6ee13cfa4363cc4a0939ca704df5f25111b9e066836c9b2b890", "created_at": "2026-05-11T01:02:45Z" }, { "slug": "7a4c7cf42e", "title": "CoTGuard v1 — CoT faithfulness probe via Lanham-2023 truncation (Qwen3.6-27B)", "author": "caiovicentino", "type": "probe-result", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Linear probe trained on Lanham-2023 truncation-induced unfaithful CoT signal. Detection-tier probe — pending Phase 8 causality verdict (template-locked under steering).", "path": "atlas/2026/7a4c7cf42e.json", "hf_repo_id": null, "hf_url": null, "doi": null, "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes", "manifest_sha256": "7a4c7cf42ed9528432f3890e675ee0c4103db9234e7c1c219d212840b0144480", "created_at": "2026-05-11T01:02:45Z" },
{ "slug": "60b5c38463", "title": "Capability locus on Qwen3.6-27B SWE-bench Pro — 4/4 pre_tool/turn_end sites pushdown-asymmetric", "author": "caiovicentino", "type": "adversarial-finding", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "α-sweep [-200,+200] on L23/L31/L43/L55 capability probes. All 4 sites show pushdown-asymmetric levers (+34 to +60pp gap vs random control). First causal verdict on capability axis. Refines paper-3 §4.1 L43 finding (was N=54 inflated).", "path": "atlas/2026/60b5c38463.json", "hf_repo_id": null, "hf_url": null, "doi": null, "paper_url": "https://openinterp.org/research/papers/saturation-direction-probe-levers", "manifest_sha256": "60b5c384633c58ab55ea5f3db93de7b09b35e584e5c696e05181e9fc2fc64deb", "created_at": "2026-05-11T01:02:45Z" }, { "slug": "7019cff912", "title": "Probe-detected grokking in multi-probe DPO (Qwen3.6-27B nb37 v2)", "author": "caiovicentino", "type": "atlas-entry", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Phase transition (ratio 2.596) in fresh-probe AUROC across 11 nb37 v2 checkpoints. Original FG/RG probes show ZERO effect — DPO learning orthogonal to task-probe axes. Construct-then-compress pattern.", "path": "atlas/2026/7019cff912.json", "hf_repo_id": "caiovicentino1/openinterp-37v2-multiprobe-dpo-extended", "hf_url": "https://huggingface.co/datasets/caiovicentino1/openinterp-37v2-multiprobe-dpo-extended", "doi": null, "paper_url": null, "manifest_sha256": "7019cff91255b679077964591a24794705ec2b20bb58374d2f265af010ca886c", "created_at": "2026-05-11T01:02:45Z" },
{ "slug": "e328cd066f", "title": "NLA two-tier verbalization — uniform fve_nrm decoupled from category-spread recall (Qwen2.5-7B + Gemma-3-12B)", "author": "caiovicentino", "type": "atlas-entry", "model_id": "Qwen/Qwen2.5-7B-Instruct + google/gemma-3-12b", "claim": "N=150. Reconstruction fve_nrm UNIFORM 0.880 across chat/code/reasoning/agent. Keyword recall MASSIVELY category-dependent (chat 0.578 / agent 0.088 = 6.5×). Better-trained NLA → smaller fve_nrm spread but LARGER recall spread (decoupling magnification).", "path": "atlas/2026/e328cd066f.json", "hf_repo_id": null, "hf_url": null, "doi": null, "paper_url": "https://openinterp.org/research/papers/nla-two-tier-verbalization", "manifest_sha256": "e328cd066f6ffe53ebb5c139da9a1be16c8a5acd02473806328e6cd0ce1e421c", "created_at": "2026-05-11T01:02:45Z" }, { "slug": "03a6e70bfd", "title": "Saturation-direction principle — 5 empirical classes of probe causality (Qwen3.6-27B)", "author": "caiovicentino", "type": "atlas-entry", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Unifies 8 probes into 5 causality classes. Saturation-direction principle: probes lever in the direction of baseline residual saturation. L55 reversal in Phase 11e (pushdown→pushup when saturation flips) strongly confirms principle.", "path": "atlas/2026/03a6e70bfd.json", "hf_repo_id": null, "hf_url": null, "doi": null, "paper_url": "https://openinterp.org/research/papers/saturation-direction-probe-levers", "manifest_sha256": "03a6e70bfd06a5336ef881c9942a1e19fe000acb9cb6c44c57c5cc07671797d0", "created_at": "2026-05-11T01:02:45Z" },
{ "slug": "a0c01e67c9", "title": "L55 CoT-Integrity probe is template-locked epiphenomenal (Qwen3.6-27B)", "author": "caiovicentino", "type": "adversarial-finding", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "N=240. AUROC 0.91. Bidirectional steering up to α=+200 (>‖residual‖) produces ZERO behavioral change for probe AND random direction. Mechanism: enable_thinking=False chat template injects in input tokens — thinking decision is not in residual stream.", "path": "atlas/2026/a0c01e67c9.json", "hf_repo_id": null, "hf_url": null, "doi": null, "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes", "manifest_sha256": "a0c01e67c97d6d1beed9539b9259d774c6d67cd8bd7f9dbfafb552572fb48663", "created_at": "2026-05-11T01:02:45Z" }, { "slug": "23bb3f2c30", "title": "L43 pre_tool probe is softmax-temp epiphenomenal (Qwen3.6-27B SWE-bench)", "author": "caiovicentino", "type": "adversarial-finding", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Triple-source convergent verdict on L43 pre_tool probe direction. (1) log-prob proxy with control-token norm: Δrel ≈ 0; (2) single-shot α=+5: 4/4 fails select same tool; (3) continuous α=+5: 3/4 keep same tool. Probe DETECTS but does not LEVER.", "path": "atlas/2026/23bb3f2c30.json", "hf_repo_id": "caiovicentino1/agent-probe-guard-qwen36-27b", "hf_url": "https://huggingface.co/datasets/caiovicentino1/agent-probe-guard-qwen36-27b", "doi": null, "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes", "manifest_sha256": "23bb3f2c303b120e2689f5dbe1c5d55ea40f25e36754f546b493fe52fb30e1d3", "created_at": "2026-05-11T01:02:45Z" },
{ "slug": "bfd84a5c21", "title": "Multi-probe ensemble OOD walk-back — 0/3 cross-distribution generalization (Qwen3.6-27B)", "author": "caiovicentino", "type": "adversarial-finding", "model_id": "Qwen/Qwen3.6-27B-Instruct", "claim": "Cross-distribution test on TruthfulQA + StrategyQA + TriviaQA. 0/3 survives, mean lift −0.002. nb45 +6.7pp was within-distribution effect. ProbePack universal-middleware framing publicly walked back. FG single probe still valid OOD on factual (TriviaQA 0.710).", "path": "atlas/2026/bfd84a5c21.json", "hf_repo_id": "caiovicentino1/openinterp-46-cross-distribution-ensemble", "hf_url": "https://huggingface.co/datasets/caiovicentino1/openinterp-46-cross-distribution-ensemble", "doi": null, "paper_url": null, "manifest_sha256": "bfd84a5c21c8a80b7078ba6a7c7cc437fb0cf9c123a7fea269439be22369094e", "created_at": "2026-05-11T01:02:45Z" } ] }