{
  "schema_version": 1,
  "last_updated": "2026-05-11T01:02:45Z",
  "count": 11,
  "entries": [
    {
      "slug": "9f2e9c5b8e",
      "title": "agent-probe-guard v0.1 — L43 pre_tool detection probe (epiphenomenal-softmax under steering)",
      "author": "caiovicentino",
      "type": "probe-result",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Detection-tier probe for tool-call success in SWE-bench traces. AUROC 0.83 at N=42 with random-feature baseline gap +0.27. Causality protocol verdict is epiphenomenal-softmax: probe DETECTS but cannot LEVER (paper-6 Phase 7 finding).",
      "path": "atlas/2026/9f2e9c5b8e.json",
      "hf_repo_id": "caiovicentino1/agent-probe-guard-qwen36-27b",
      "hf_url": "https://huggingface.co/datasets/caiovicentino1/agent-probe-guard-qwen36-27b",
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes",
      "manifest_sha256": "9f2e9c5b8e4fbb7c7eb4c9290d927e76341006c9d63277da05f4d21a0ab26c9b",
      "created_at": "2026-05-10T17:00:00Z"
    },
    {
      "slug": "8d5df2d5d5",
      "title": "FabricationGuard v2 — L31 cross-task hallucination probe (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "probe-result",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Linear probe on layer 31 residual stream detects confident hallucinations across tasks. HaluEval within 0.90, SimpleQA cross-task 0.88. −88% confident-wrong reduction in SimpleQA. ~1ms sklearn p95 inference.",
      "path": "atlas/2026/8d5df2d5d5.json",
      "hf_repo_id": "openinterp/fabricationguard-qwen36-27b-l31-v2",
      "hf_url": "https://huggingface.co/openinterp/fabricationguard-qwen36-27b-l31-v2",
      "doi": null,
      "paper_url": "https://openinterp.org/products/fabricationguard",
      "manifest_sha256": "8d5df2d5d52521632e01d2e2617c6d6b328b85c756b68e4d2c70a19ba31be9fc",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "49eba51edb",
      "title": "ReasonGuard v0.2 — L55 mid_think CoT faithfulness probe (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "probe-result",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Position-of-faithfulness probe at L55 mid_think token. AUROC 0.888 within GSM8K, 0.605 cross StrategyQA. Honest narrow-scope finding — domain-bound, not universal.",
      "path": "atlas/2026/49eba51edb.json",
      "hf_repo_id": "openinterp/reasonguard-qwen36-27b-l55-mid_think",
      "hf_url": "https://huggingface.co/openinterp/reasonguard-qwen36-27b-l55-mid_think",
      "doi": null,
      "paper_url": null,
      "manifest_sha256": "49eba51edb65b6ee13cfa4363cc4a0939ca704df5f25111b9e066836c9b2b890",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "7a4c7cf42e",
      "title": "CoTGuard v1 — CoT faithfulness probe via Lanham-2023 truncation (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "probe-result",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Linear probe trained on Lanham-2023 truncation-induced unfaithful CoT signal. Detection-tier probe — pending Phase 8 causality verdict (template-locked under steering).",
      "path": "atlas/2026/7a4c7cf42e.json",
      "hf_repo_id": null,
      "hf_url": null,
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes",
      "manifest_sha256": "7a4c7cf42ed9528432f3890e675ee0c4103db9234e7c1c219d212840b0144480",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "60b5c38463",
      "title": "Capability locus on Qwen3.6-27B SWE-bench Pro — 4/4 pre_tool/turn_end sites pushdown-asymmetric",
      "author": "caiovicentino",
      "type": "adversarial-finding",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "α-sweep [-200,+200] on L23/L31/L43/L55 capability probes. All 4 sites show pushdown-asymmetric levers (+34 to +60pp gap vs random control). First causal verdict on capability axis. Refines paper-3 §4.1 L43 finding (was N=54 inflated).",
      "path": "atlas/2026/60b5c38463.json",
      "hf_repo_id": null,
      "hf_url": null,
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/saturation-direction-probe-levers",
      "manifest_sha256": "60b5c384633c58ab55ea5f3db93de7b09b35e584e5c696e05181e9fc2fc64deb",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "7019cff912",
      "title": "Probe-detected grokking in multi-probe DPO (Qwen3.6-27B nb37 v2)",
      "author": "caiovicentino",
      "type": "atlas-entry",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Phase transition (ratio 2.596) in fresh-probe AUROC across 11 nb37 v2 checkpoints. Original FG/RG probes show ZERO effect — DPO learning orthogonal to task-probe axes. Construct-then-compress pattern.",
      "path": "atlas/2026/7019cff912.json",
      "hf_repo_id": "caiovicentino1/openinterp-37v2-multiprobe-dpo-extended",
      "hf_url": "https://huggingface.co/datasets/caiovicentino1/openinterp-37v2-multiprobe-dpo-extended",
      "doi": null,
      "paper_url": null,
      "manifest_sha256": "7019cff91255b679077964591a24794705ec2b20bb58374d2f265af010ca886c",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "e328cd066f",
      "title": "NLA two-tier verbalization — uniform fve_nrm decoupled from category-spread recall (Qwen2.5-7B + Gemma-3-12B)",
      "author": "caiovicentino",
      "type": "atlas-entry",
      "model_id": "Qwen/Qwen2.5-7B-Instruct + google/gemma-3-12b",
      "claim": "N=150. Reconstruction fve_nrm UNIFORM 0.880 across chat/code/reasoning/agent. Keyword recall MASSIVELY category-dependent (chat 0.578 / agent 0.088 = 6.5×). Better-trained NLA → smaller fve_nrm spread but LARGER recall spread (decoupling magnification).",
      "path": "atlas/2026/e328cd066f.json",
      "hf_repo_id": null,
      "hf_url": null,
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/nla-two-tier-verbalization",
      "manifest_sha256": "e328cd066f6ffe53ebb5c139da9a1be16c8a5acd02473806328e6cd0ce1e421c",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "03a6e70bfd",
      "title": "Saturation-direction principle — 5 empirical classes of probe causality (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "atlas-entry",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Unifies 8 probes into 5 causality classes. Saturation-direction principle: probes lever in the direction of baseline residual saturation. L55 reversal in Phase 11e (pushdown→pushup when saturation flips) strongly confirms principle.",
      "path": "atlas/2026/03a6e70bfd.json",
      "hf_repo_id": null,
      "hf_url": null,
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/saturation-direction-probe-levers",
      "manifest_sha256": "03a6e70bfd06a5336ef881c9942a1e19fe000acb9cb6c44c57c5cc07671797d0",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "a0c01e67c9",
      "title": "L55 CoT-Integrity probe is template-locked epiphenomenal (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "adversarial-finding",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "N=240. AUROC 0.91. Bidirectional steering up to α=+200 (>‖residual‖) produces ZERO behavioral change for probe AND random direction. Mechanism: enable_thinking=False chat template injects <think></think> in input tokens — thinking decision is not in residual stream.",
      "path": "atlas/2026/a0c01e67c9.json",
      "hf_repo_id": null,
      "hf_url": null,
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes",
      "manifest_sha256": "a0c01e67c97d6d1beed9539b9259d774c6d67cd8bd7f9dbfafb552572fb48663",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "23bb3f2c30",
      "title": "L43 pre_tool probe is softmax-temp epiphenomenal (Qwen3.6-27B SWE-bench)",
      "author": "caiovicentino",
      "type": "adversarial-finding",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Triple-source convergent verdict on L43 pre_tool probe direction. (1) log-prob proxy with control-token norm: Δrel ≈ 0; (2) single-shot α=+5: 4/4 fails select same tool; (3) continuous α=+5: 3/4 keep same tool. Probe DETECTS but does not LEVER.",
      "path": "atlas/2026/23bb3f2c30.json",
      "hf_repo_id": "caiovicentino1/agent-probe-guard-qwen36-27b",
      "hf_url": "https://huggingface.co/datasets/caiovicentino1/agent-probe-guard-qwen36-27b",
      "doi": null,
      "paper_url": "https://openinterp.org/research/papers/two-forms-epiphenomenal-probes",
      "manifest_sha256": "23bb3f2c303b120e2689f5dbe1c5d55ea40f25e36754f546b493fe52fb30e1d3",
      "created_at": "2026-05-11T01:02:45Z"
    },
    {
      "slug": "bfd84a5c21",
      "title": "Multi-probe ensemble OOD walk-back — 0/3 cross-distribution generalization (Qwen3.6-27B)",
      "author": "caiovicentino",
      "type": "adversarial-finding",
      "model_id": "Qwen/Qwen3.6-27B-Instruct",
      "claim": "Cross-distribution test on TruthfulQA + StrategyQA + TriviaQA. 0/3 survives, mean lift −0.002. nb45 +6.7pp was within-distribution effect. ProbePack universal-middleware framing publicly walked back. FG single probe still valid OOD on factual (TriviaQA 0.710).",
      "path": "atlas/2026/bfd84a5c21.json",
      "hf_repo_id": "caiovicentino1/openinterp-46-cross-distribution-ensemble",
      "hf_url": "https://huggingface.co/datasets/caiovicentino1/openinterp-46-cross-distribution-ensemble",
      "doi": null,
      "paper_url": null,
      "manifest_sha256": "bfd84a5c21c8a80b7078ba6a7c7cc437fb0cf9c123a7fea269439be22369094e",
      "created_at": "2026-05-11T01:02:45Z"
    }
  ]
}