// Honest accuracy probe — runs the locally-built CLI against real npm release
// pairs and classifies each outcome against the author's published bump (the
// oracle). It is the artifact behind the README's "Accuracy & Limitations"
// numbers: anyone can re-run it. Node built-ins only, no dependencies.
//
//   npm run build && node scripts/accuracy-probe.mjs
//
// Frozen scorecard (2026-06-26, semver-checks @ graded confidence):
//   analyzable 37/44 | exact 19 | stricter-than-published 9 | looser 9 | OOM 3 | ERROR 4
//   Of the 9 stricter rows, `--strict` (proven majors only) fires on 4 — real
//   breaks the author under-bumped (p-limit 6.1.0 + ky 1.14.0 added a required
//   property to an exported type, commander 12.1.0/14.0.2 removed/narrowed a
//   public member) — and demotes the other 5 to review-only heuristic majors
//   (equivalence rewrites, input-union widening, return-only generics).
//
// Each row: [package, oldVersion, newVersion, publishedBump, apiShape].
// shapes: pure=pure types, dual=ESM/CJS, esm=single ESM, esmOnly=modern ESM-only,
// ns=namespace/declaration-merging heavy, subpath=multi-subpath exports.
import { spawn } from 'node:child_process';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// Absolute path of the locally-built CLI entry point (`../bin/semver-checks.js`
// relative to this script), independent of the current working directory.
const CLI = fileURLToPath(new URL('../bin/semver-checks.js', import.meta.url));
// Watchdog budget for a single CLI run, in milliseconds (2.5 minutes).
const TIMEOUT_MS = 150 * 1000;
// Number of CLI runs kept in flight at once.
const CONCURRENCY = 2;

// Probe corpus. Each row is [package, oldVersion, newVersion, publishedBump, apiShape]:
// the two versions are handed to the CLI as `npm:<package>@<version>` specifiers,
// publishedBump is the oracle the CLI's recommendation is graded against, and
// apiShape tags the row in the progress log and groups the shape x outcome table.
const PAIRS = [
  // shape: pure
  ['type-fest', '5.6.0', '5.7.0', 'minor', 'pure'],
  ['type-fest', '5.4.1', '5.4.2', 'patch', 'pure'],
  ['type-fest', '4.41.0', '5.0.0', 'major', 'pure'],
  ['utility-types', '3.10.0', '3.11.0', 'minor', 'pure'],
  ['utility-types', '3.6.0', '3.6.1', 'patch', 'pure'],
  ['utility-types', '2.1.0', '3.0.0', 'major', 'pure'],
  ['ts-toolbelt', '9.5.12', '9.5.13', 'patch', 'pure'],
  ['ts-toolbelt', '9.5.13', '9.6.0', 'minor', 'pure'],
  // shape: dual
  ['commander', '11.1.0', '12.0.0', 'major', 'dual'],
  ['commander', '12.0.0', '12.1.0', 'minor', 'dual'],
  ['commander', '14.0.1', '14.0.2', 'patch', 'dual'],
  ['clsx', '1.2.1', '2.0.0', 'major', 'dual'],
  ['clsx', '2.0.0', '2.1.0', 'minor', 'dual'],
  ['clsx', '2.1.0', '2.1.1', 'patch', 'dual'],
  ['tiny-invariant', '1.2.0', '1.3.0', 'minor', 'dual'],
  ['tiny-invariant', '1.3.2', '1.3.3', 'patch', 'dual'],
  // shape: subpath
  ['uuid', '11.0.4', '11.1.0', 'minor', 'subpath'],
  ['uuid', '11.1.1', '12.0.0', 'major', 'subpath'],
  ['uuid', '13.0.2', '14.0.0', 'major', 'subpath'],
  // shape: esm
  ['nanoid', '5.0.9', '5.1.0', 'minor', 'esm'],
  ['nanoid', '5.1.15', '5.1.16', 'patch', 'esm'],
  ['mitt', '2.1.0', '3.0.0', 'major', 'esm'],
  ['mitt', '3.0.0', '3.0.1', 'patch', 'esm'],
  ['p-limit', '4.0.0', '5.0.0', 'major', 'esm'],
  ['p-limit', '6.0.0', '6.1.0', 'minor', 'esm'],
  ['escape-string-regexp', '4.0.0', '5.0.0', 'major', 'esm'],
  ['escape-string-regexp', '2.0.0', '3.0.0', 'major', 'esm'],
  ['ky', '1.13.0', '1.14.0', 'minor', 'esm'],
  ['ky', '1.14.2', '1.14.3', 'patch', 'esm'],
  ['ky', '1.14.3', '2.0.0', 'major', 'esm'],
  ['execa', '9.5.0', '9.6.0', 'minor', 'esm'],
  ['execa', '9.6.0', '9.6.1', 'patch', 'esm'],
  ['slugify', '1.6.5', '1.6.6', 'patch', 'esm'],
  ['slugify', '1.5.3', '1.6.0', 'minor', 'esm'],
  // shape: esmOnly
  ['chalk', '4.1.2', '5.0.0', 'major', 'esmOnly'],
  ['chalk', '5.3.0', '5.4.0', 'minor', 'esmOnly'],
  ['chalk', '5.4.0', '5.4.1', 'patch', 'esmOnly'],
  // shape: ns
  ['zod', '4.4.0', '4.4.1', 'patch', 'ns'],
  ['zod', '4.4.1', '4.4.2', 'patch', 'ns'],
  ['yargs', '17.7.2', '17.7.3', 'patch', 'ns'],
  ['yargs', '17.7.3', '18.0.0', 'major', 'ns'],
  // shape: esm (continued)
  ['picocolors', '1.0.0', '1.0.1', 'patch', 'esm'],
  ['picocolors', '1.0.1', '1.1.0', 'minor', 'esm'],
  ['picocolors', '1.1.0', '1.1.1', 'patch', 'esm'],
];

// Ordinal weight of each bump level (higher = more severe). classify() uses
// these ranks to decide whether the CLI's recommendation is stricter or looser
// than the published bump.
const RANK = { patch: 0, minor: 1, major: 2 };

// Runs the CLI's `compare` command for one release pair in a child Node process
// (`compare npm:<pkg>@<oldV> npm:<pkg>@<newV> --format json`) and captures
// everything it prints.
//
// Resolves — never rejects — with { code, signal, out, err, killed }:
//   code/signal  exit code and terminating signal from the child's 'close' event
//                (both null when the child reported an 'error' instead)
//   out/err      accumulated stdout / stderr text
//   killed       true when the TIMEOUT_MS watchdog had to SIGKILL the child
function runOne(pkg, oldV, newV) {
  return new Promise((resolve) => {
    const args = ['compare', `npm:${pkg}@${oldV}`, `npm:${pkg}@${newV}`, '--format', 'json'];
    // process.execPath runs the CLI under the same Node binary as this script
    // instead of whichever `node` happens to be first on PATH.
    const child = spawn(process.execPath, [CLI, ...args]);
    let out = '';
    let err = '';
    let killed = false;
    const timer = setTimeout(() => {
      killed = true;
      child.kill('SIGKILL');
    }, TIMEOUT_MS);
    // Single exit path; resolve() ignores any call after the first, so it is
    // safe for both 'error' and 'close' to reach it.
    const finish = (code, signal) => {
      clearTimeout(timer);
      resolve({ code, signal, out, err, killed });
    };
    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk Buffer-to-string coercion.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => { out += chunk; });
    child.stderr.on('data', (chunk) => { err += chunk; });
    // Without a listener, a failed spawn raises an unhandled 'error' event and
    // takes the whole probe down; record it on `err` and settle this row instead.
    child.on('error', (spawnError) => {
      err += `spawn error: ${spawnError.message}\n`;
      finish(null, null);
    });
    child.on('close', finish);
  });
}

// Grades one CLI run against the author's published bump.
//
//   r      result object from runOne: { code, signal, out, err, killed }
//   label  the published bump for this pair ('patch' | 'minor' | 'major')
//
// Returns { status } alone for the failure states, checked in this order:
//   TIMEOUT    the watchdog killed the run
//   OOM        stderr mentions a heap / allocation failure
//   ERROR      exit code 2, or nothing on stdout
//   PARSEFAIL  stdout is not JSON, or is JSON but not an object
// otherwise { status: 'OK', recommended, verdict, proven, review, gates }.
function classify(r, label) {
  const errl = (r.err || '').toLowerCase();
  const stdout = r.out || '';
  if (r.killed) return { status: 'TIMEOUT' };
  if (errl.includes('heap out of memory') || errl.includes('allocation failure')) return { status: 'OOM' };
  if (r.code === 2 || !stdout.trim()) return { status: 'ERROR' };

  let json;
  try {
    json = JSON.parse(stdout);
  } catch {
    return { status: 'PARSEFAIL' };
  }
  // Valid JSON that is not an object (`null`, a bare number or string) carries
  // no report fields: `null` used to crash on the property read below, and the
  // others were silently graded as an OK "looser" row.
  if (json === null || typeof json !== 'object') return { status: 'PARSEFAIL' };

  const rec = json.recommended;
  let verdict;
  if (rec === label) {
    verdict = 'exact';
  } else if (RANK[rec] > RANK[label]) {
    verdict = 'stricter';
  } else {
    // Also reached when `rec` is not a key of RANK: the comparison above is
    // then against `undefined`, which is always false.
    verdict = 'looser';
  }

  // majorProven / majorReview split the major count by confidence: `--strict`
  // gates only on proven, so `gates` records whether the strict CI gate fires.
  // When the report has no majorProven, the plain `major` count stands in for it.
  const proven = json.summary?.majorProven ?? json.summary?.major ?? 0;
  const review = json.summary?.majorReview ?? 0;
  return { status: 'OK', recommended: rec, verdict, proven, review, gates: proven > 0 };
}

// Applies `fn(item, index)` to every entry of `items` using `n` workers, so at
// most `n` calls are in flight at once. Resolves to an array holding each
// call's result at the index of its input.
async function pool(items, n, fn) {
  const collected = new Array(items.length);
  let cursor = 0;

  // A worker keeps claiming the next unclaimed index until none are left. The
  // claim (read + increment) happens before the await, so two workers can
  // never take the same slot.
  const drain = async () => {
    while (cursor < items.length) {
      const slot = cursor;
      cursor += 1;
      collected[slot] = await fn(items[slot], slot);
    }
  };

  const workers = Array.from({ length: n }, () => drain());
  await Promise.all(workers);
  return collected;
}

// Run every pair through the CLI (CONCURRENCY at a time), grade it, and print
// one progress line per finished pair to stderr. Each entry of `results` is the
// pair's fields plus everything classify() returned for it.
const results = await pool(PAIRS, CONCURRENCY, async (pair, idx) => {
  const [pkg, fromVersion, toVersion, label, shape] = pair;
  const run = await runOne(pkg, fromVersion, toVersion);
  const graded = classify(run, label);

  // Recommendation + verdict, shown only when the CLI produced a recommendation.
  let recNote = '';
  if (graded.recommended) {
    recNote = ` rec=${graded.recommended} [${graded.verdict}]`;
  }
  // Proven/review split, shown only for OK rows that report at least one major.
  let confNote = '';
  if (graded.status === 'OK' && (graded.proven || graded.review)) {
    confNote = ` major{proven:${graded.proven},review:${graded.review}}`;
  }

  const position = `[${String(idx + 1).padStart(2)}/${PAIRS.length}]`;
  console.error(`${position} ${pkg} ${fromVersion}->${toVersion} (${shape}/${label}) => ${graded.status}${recNote}${confNote}`);

  return { pkg, old: fromVersion, new: toVersion, label, shape, ...graded };
});

// Number of result rows, across the whole run, that satisfy predicate `f`.
const c = (f) => results.filter(f).length;
// Statuses classify() can return that have no column of their own. They are
// reported only when they occur, so a run where every row is OK / OOM / ERROR
// prints exactly the same lines as before, while a timed-out or unparseable
// row no longer vanishes from the totals.
const otherStatuses = ['TIMEOUT', 'PARSEFAIL'];

console.error('\n=== shape x outcome ===');
console.error('shape       n  exact strict loose OOM ERR');
for (const s of new Set(PAIRS.map((p) => p[4]))) {
  const rows = results.filter((r) => r.shape === s);
  // Right-aligned count of this shape's rows matching `f`; each width equals
  // the length of the column's header label so the cells line up under it.
  const k = (f, width) => String(rows.filter(f).length).padStart(width);
  const other = rows.filter((r) => otherStatuses.includes(r.status)).length;
  const line = [
    s.padEnd(10),
    `${String(rows.length).padStart(2)} `,
    k((r) => r.verdict === 'exact', 5),
    k((r) => r.verdict === 'stricter', 6), // "strict" is 6 wide; 5 shifted every later column
    k((r) => r.verdict === 'looser', 5),
    k((r) => r.status === 'OOM', 3),
    k((r) => r.status === 'ERROR', 3),
  ].join(' ');
  console.error(other > 0 ? `${line}  (+${other} TIMEOUT/PARSEFAIL)` : line);
}

const summary = [
  `analyzable ${c((r) => r.status === 'OK')}/${PAIRS.length}`,
  `exact ${c((r) => r.verdict === 'exact')}`,
  `stricter ${c((r) => r.verdict === 'stricter')}`,
  `looser ${c((r) => r.verdict === 'looser')}`,
  `OOM ${c((r) => r.status === 'OOM')}`,
  `ERROR ${c((r) => r.status === 'ERROR')}`,
];
for (const status of otherStatuses) {
  const count = c((r) => r.status === status);
  if (count > 0) summary.push(`${status} ${count}`);
}
console.error(`\n${summary.join(' | ')}`);

// Graded-confidence view: of the rows stricter than the published bump, how many
// the `--strict` gate still fires on (a proven major) vs. demotes to review-only
// (heuristic). The latter are the false positives graded confidence isolates from
// the gate; the former are real breaks the author under-bumped.
//
// The last figure counts `exact` rows that gate. On those rows the
// recommendation equals the published bump, so the message must not describe
// them as under-bumped.
const stricter = results.filter((r) => r.verdict === 'stricter');
const stricterGated = stricter.filter((r) => r.gates).length;
const exactGated = results.filter((r) => r.verdict === 'exact' && r.gates).length;
console.error(
  `\n--strict gate: fires on ${stricterGated}/${stricter.length} stricter-than-published rows ` +
    `(${stricter.length - stricterGated} demoted to review-only). ` +
    `exact rows that still gate (proven major, published bump matches the recommendation): ${exactGated}.`,
);