% This file was adapted from ICLR2022_conference.tex example provided for the ICLR conference
\documentclass{article} % For LaTeX2e
\usepackage{conference,times}
\usepackage{easyReview}
\usepackage{algorithm}
\usepackage{algorithmic}
% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}
% amsmath provides \eqref, \text, and the equation environments used throughout;
% it must be loaded before amsthm (safe no-op if math_commands.tex already loads it).
\usepackage{amsmath}
\usepackage{amsthm,amssymb}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
% Please leave these options as they are
\usepackage{hyperref}
\hypersetup{
  colorlinks=true,
  linkcolor=red,
  filecolor=magenta,
  urlcolor=blue,
  citecolor=purple,
  pdftitle={Certified Quantum Reservoir Advantage Regimes Under Parity Constraints},
  pdfpagemode=FullScreen,
}

\title{Certified Regime Mapping for Quantum Reservoir Computing\\Under Parity-Constrained Evaluation}

\author{Marius-Constantin Dinu \\
Independent Researcher \\
\texttt{marius.constantin.dinu@research.example}}

\begin{document}

\maketitle

\begin{abstract}
Quantum reservoir computing is frequently evaluated with point-estimate accuracy gains that confound representation effects, readout parity, and computational cost. We present a hybrid formal-and-simulation study of image classification with PCA-encoded inputs that reframes the question as a certified, dataset-conditional regime-mapping problem. The method combines (i) a cost-normalized objective over a finite configuration grid, (ii) a theorem-backed parity gate showing that linearly isomorphic quantum and classical feature spaces cannot support intrinsic readout-stage advantage claims, and (iii) one-sided lower-confidence-bound certification with familywise multiplicity control.
Under a fixed policy ($\tau_{\mathrm{eff}}=0.01$, $\tau_{\mathrm{iso}}=0.05$, $\alpha_{\mathrm{FWER}}=0.05$), certified regions are non-empty for Fashion-MNIST and CIFAR10-gray but empty for MNIST, supporting a bounded-advantage interpretation rather than a universal quantum gain claim. The same framework yields auditable negative controls, explicit caveats, and a reproducible path for transferring this evaluation methodology to other reservoir settings. \end{abstract} \section{Introduction} Quantum reservoir computing (QRC) offers a practical route to supervised learning with fixed quantum dynamics and trainable classical readout, but claimed advantages remain difficult to interpret when preprocessing, readout, and resource accounting are not parity-controlled \citep{schuld2021kernel,dong2020recurrent,deLorenzis2025entanglement}. This issue is particularly acute in image classification with PCA-compressed inputs, where easy benchmarks can saturate separability and inflate apparent gains \citep{deLorenzis2024qelm,xiao2017fashion,krizhevsky2009cifar}. Beyond benchmark selection, two additional confounds are central: (i) feature-space isomorphism can make quantum and classical readout objectives equivalent even when implementations differ \citep{schuld2021kernel,dong2020recurrent}, and (ii) gains that ignore runtime, memory, and measurement burden can be operationally misleading \citep{gross2026measurement,mujal2023timeseries}. This paper addresses these confounds with a certification-first methodology designed for CPU-only execution. The core object is not a single leaderboard score, but a bounded set of operating regimes where cost-normalized advantage is statistically certified after theorem-gated parity filtering. This framing aligns advantage claims with what was actually derived and computed, and it yields explicit failure regions, rather than silent overgeneralization. 
\textbf{Contributions.} \begin{itemize} \item We define a cost-normalized regime objective over a finite sweep domain and formalize advantage as a bounded acceptance set rather than a scalar headline metric. \item We prove complete readout-stage equivalence theorems for linearly isomorphic feature spaces and use them as a hard parity gate before empirical certification. \item We introduce a conservative certification rule combining parity-gate rejection, one-sided lower confidence bounds, and familywise multiplicity control. \item We provide a reproducible simulation campaign showing certified non-empty regions on harder datasets (Fashion-MNIST, CIFAR10-gray) while MNIST remains uncertified under the same policy. \end{itemize} The remainder of the manuscript is organized as follows. \Secref{sec:related} positions this work against prior QRC/QELM literature, \secref{sec:problem} defines the formal setting and proofs, \secref{sec:protocol} details the simulation protocol, and \secref{sec:results} reports certified empirical findings. \section{Related Work and Gap Positioning} \label{sec:related} \subsection{Kernel and Encoding Perspectives} Kernel reinterpretations of supervised quantum models show that much of the modeling behavior is controlled by induced similarity structure and readout regularization, not by quantum labels alone \citep{schuld2021kernel,dong2020recurrent}. In parallel, encoding theory demonstrates that expressive capacity depends strongly on feature-map frequency content and robustness properties \citep{schuld2021encoding,larose2020robust}. These results are strengths because they provide mathematically analyzable structure, but they also expose a limitation in many empirical comparisons: without strict preprocessing and readout parity, attribution to quantum dynamics is ambiguous. 
\subsection{QRC/QELM Evidence and Entanglement Claims} Recent QELM studies report promising behavior on image tasks with PCA and fixed readout training, while emphasizing dependence on Hamiltonian choice, evolution time, and measurement design \citep{deLorenzis2024qelm,deLorenzis2025entanglement}. Additional work links quantumness indicators and connectivity to memory and feature quality \citep{gotting2023quantumness,hayashi2025featuremap,tran2020higherorder}. The strength of this literature is mechanistic breadth, but two persistent gaps remain: inconsistent uncertainty reporting and limited parity-gated interpretation of reported gains. \subsection{Measurement and Practical Cost Constraints} Measurement protocol and observable design strongly influence performance-cost tradeoffs in reservoir settings \citep{mujal2023timeseries,gross2026measurement}. This motivates explicit cost-normalized objectives and fixed computational budgets, especially under local CPU-only constraints. The main gap motivating our method is therefore not a lack of candidate architectures, but a lack of theorem-aware, policy-certified reporting that can separate supportable claims from artifact-sensitive ones. \section{Problem Setting and Formal Preliminaries} \label{sec:problem} We consider supervised classification with matched splits and preprocessing across quantum and classical pipelines. Let $\train$, $\valid$, and $\test$ denote train/validation/test partitions. A preprocessing map $P_X: \mathbb{R}^{d_0}\rightarrow\mathbb{R}^{X}$ retains $X$ PCA components. For dataset $d\in\{\text{MNIST},\text{Fashion-MNIST},\text{CIFAR10-gray}\}$ and configuration \[ \xi=(X,t,N_q,M,\lambda_r)\in\Xi=\mathcal{X}\times\mathcal{T}\times\mathcal{N}_q\times\mathcal{M}\times\Lambda, \] we evaluate utility and cost under parity-constrained training budgets. 
\subsection{Objective and Feasible Regimes} Let $J_d^q(\xi)$ and $J_d^c(\xi)$ denote held-out utility (macro-F1), and let $C_d^q(\xi)$ and $C_d^c(\xi)$ denote normalized cost (runtime, memory, and shot proxy). The cost-normalized advantage is \begin{equation} \label{eq:delta} \Delta_d(\xi)=J_d^q(\xi)-J_d^c(\xi)-\lambda_c\big(C_d^q(\xi)-C_d^c(\xi)\big), \end{equation} where $\lambda_c\ge 0$ controls cost penalization. The optimization target and bounded gain set are \begin{equation} \label{eq:regime} \xi_d^{\star}\in\arg\max_{\xi\in\Xi}\Delta_d(\xi),\qquad \mathcal{R}_d(\tau)=\{\xi\in\Xi:\Delta_d(\xi)\ge\tau\}. \end{equation} \Eqref{eq:regime} defines the optimality criterion used in all downstream claims. \subsection{Parity-Equivalence Guardrail} Define feature matrices $Z_q,Z_c\in\mathbb{R}^{n\times m}$ and target matrix $Y\in\mathbb{R}^{n\times C}$. Data-fit minima are \begin{equation} \label{eq:datafit} J_q^{\star}=\min_{W\in\mathbb{R}^{m\times C}}\|Y-Z_qW\|_F^2,\qquad J_c^{\star}=\min_{V\in\mathbb{R}^{m\times C}}\|Y-Z_cV\|_F^2. \end{equation} The ridge objectives with transported metric are \begin{equation} \label{eq:ridge} L_q(W)=\|Y-Z_qW\|_F^2+\alpha\|W\|_F^2, \quad L_c^G(V)=\|Y-Z_cV\|_F^2+\alpha\,\mathrm{tr}(V^\top G V), \end{equation} with $G=(T^{-1})^\top T^{-1}$ when $Z_q=Z_cT$. \begin{theorem}[Data-fit equivalence under invertible transport] \label{thm:datafit} Assume $Z_q=Z_cT$ for an invertible $T\in\mathbb{R}^{m\times m}$. Then $J_q^{\star}=J_c^{\star}$. \end{theorem} \begin{proof} Define the map $\Phi:W\mapsto V=TW$. Because $T$ is invertible, $\Phi$ is bijective over $\mathbb{R}^{m\times C}$. For every $W$, $Z_qW=Z_cTW=Z_cV$, so each predictor generated by $Z_q$ is feasible under $Z_c$. Conversely, for each $V$, $W=T^{-1}V$ gives $Z_cV=Z_cTT^{-1}V=Z_qW$, so feasible predictor sets coincide. Therefore both minima in \eqref{eq:datafit} optimize the same objective over the same predictor set, implying equality. 
\end{proof} \begin{theorem}[Regularized equivalence under metric transport] \label{thm:ridge} Under the assumptions of Theorem~\ref{thm:datafit}, \[ \min_W L_q(W)=\min_V L_c^G(V),\qquad G=(T^{-1})^\top T^{-1}. \] \end{theorem} \begin{proof} Use the same bijection $V=TW$. The data-fit terms are equal because $Z_qW=Z_cV$. For the regularizer, $W=T^{-1}V$ implies \[ \|W\|_F^2=\mathrm{tr}(W^\top W) =\mathrm{tr}\big(V^\top (T^{-1})^\top T^{-1}V\big)=\mathrm{tr}(V^\top G V). \] Hence $L_q(W)=L_c^G(V)$ pointwise under a bijective reparameterization, so the optimal values are equal. \end{proof} \begin{corollary}[Orthogonal special case] \label{cor:orth} If $T^\top T=I$, then $G=I$ and Theorem~\ref{thm:ridge} reduces to isotropic ridge equivalence. \end{corollary} \begin{proof} When $T^\top T=I$, we have $T^{-1}=T^\top$ and therefore $G=(T^{-1})^\top T^{-1}=TT^\top=I$. Substituting into Theorem~\ref{thm:ridge} gives the claim. \end{proof} \subsection{Statistical Certification Rule} For repeats $k=1,\dots,K$, define per-run differences \[ D_{d,\xi}^{(k)}=(J_{d,q}^{(k)}-J_{d,c}^{(k)})-\lambda_c(C_{d,q}^{(k)}-C_{d,c}^{(k)}). \] Let $\widehat{\Delta}_{d,\xi}$ and $s_{d,\xi}$ denote the sample mean and standard deviation of $D_{d,\xi}^{(k)}$. The one-sided lower confidence bound is \begin{equation} \label{eq:lcb} \mathrm{LCB}_{d,\xi}=\widehat{\Delta}_{d,\xi}-t_{1-\alpha_m,K-1}\frac{s_{d,\xi}}{\sqrt{K}}, \end{equation} where $\alpha_m$ is Holm-adjusted over the tested family. A configuration is certified if and only if the parity gate rejects equivalence and $\mathrm{LCB}_{d,\xi}\ge\tau_{\mathrm{eff}}$. \begin{algorithm}[t] \caption{Certified Regime Mapping with Parity Gate} \label{alg:certified} \begin{algorithmic} \STATE Fix policy $(\tau_{\mathrm{eff}},\tau_{\mathrm{iso}},\alpha_{\mathrm{FWER}})$ and finite grid $\Xi$. 
\FOR{each dataset $d$ and configuration $\xi\in\Xi$} \STATE Estimate $\widehat{T}$ from paired feature matrices and compute residual ratio $r$ and $\kappa(\widehat{T})$. \STATE Mark parity-equivalent if $r\le\tau_{\mathrm{iso}}$ and conditioning/rank checks pass. \STATE Aggregate repeated-run deltas and compute $\mathrm{LCB}_{d,\xi}$ via \eqref{eq:lcb} with Holm correction. \ENDFOR \STATE Return certified set $\widehat{\mathcal{R}}_d=\{\xi: \text{not parity-equivalent and } \mathrm{LCB}_{d,\xi}\ge\tau_{\mathrm{eff}}\}$. \end{algorithmic} \end{algorithm} \section{Experimental Protocol} \label{sec:protocol} The simulation protocol follows the selected benchmark-first path with fixed parity controls and CPU-only execution. Datasets include MNIST, Fashion-MNIST, and CIFAR10-gray; preprocessing and splits are shared across all baselines; and readout tuning budgets are matched. The default certification policy is $\tau_{\mathrm{eff}}=0.01$, $\tau_{\mathrm{iso}}=0.05$, and $\alpha_{\mathrm{FWER}}=0.05$ with Holm correction. Baselines include classical ESN with ridge readout, RBF-SVM on PCA features, MLP on PCA features, random Fourier features with ridge, and a reduced-entanglement quantum control. Sweep axes cover PCA dimension, evolution time, qubit count, observable budget, and ridge regularization. This setup directly operationalizes \eqref{eq:delta}--\eqref{eq:lcb} and \algref{alg:certified}. \section{Results} \label{sec:results} \subsection{Regime Surfaces and Bounded Advantage} \Figref{fig:regimes} summarizes cost-normalized regime behavior across datasets. Harder datasets exhibit visibly larger high-value cells, while the MNIST panel remains conservative after certification overlays. This pattern matches the bounded-regime interpretation in \secref{sec:problem}, where support depends on region volume rather than isolated peaks. 
\begin{figure}[t] \centering \includegraphics[width=0.66\linewidth]{figures/F1_F3_regime_maps.pdf} \caption{Multi-panel regime maps across MNIST, Fashion-MNIST, and CIFAR10-gray under fixed parity and policy controls. The heatmap value is the estimated cost-normalized gain and the markers indicate certified cells after parity gating plus multiplicity-corrected lower-confidence testing; the panel contrast shows that certified support concentrates on harder datasets while MNIST remains conservative.} \label{fig:regimes} \end{figure} Table~\ref{tab:cert} provides dataset-level certification volumes at the default policy: CIFAR10-gray has 10 certified cells, Fashion-MNIST has 9, and MNIST has 0. These counts are paired with negative mean $\widehat{\Delta}$ over the full grid, highlighting why certification is necessary: sparse positive regimes can coexist with globally unfavorable averages. \begin{table}[t] \caption{Dataset-level certification summary under the default policy. Certified and uncertified volumes quantify bounded-support behavior and should be interpreted jointly with the confidence rule in \eqref{eq:lcb}.} \label{tab:cert} \centering \small \renewcommand{\arraystretch}{1.1} \setlength{\tabcolsep}{4pt} \begin{tabular}{lccccc} \hline Dataset & Certified & Uncertified & Gate-blocked rate & $\widehat{\Delta}$ mean & $\widehat{\sigma}_{\Delta}$ mean \\ \hline CIFAR10-gray & 10 & 35 & 0.00 & $-0.1288$ & $0.0301$ \\ Fashion-MNIST & 9 & 36 & 0.00 & $-0.1569$ & $0.0307$ \\ MNIST & 0 & 45 & 0.00 & $-0.1900$ & $0.0330$ \\ \hline \end{tabular} \end{table} \subsection{Parity Diagnostics and Theorem-Gate Evidence} \Figref{fig:parity} and Table~\ref{tab:parity} report parity diagnostics used before certification. The theorem-gate rate is zero across all dataset-baseline groups, and residual ratios lie between approximately $0.757$ and $0.835$ with condition numbers spanning roughly $6.9\times 10^2$ to $2.47\times 10^3$. 
These values indicate that exact equivalence assumptions are not broadly satisfied, justifying empirical certification rather than theorem-only acceptance. \begin{figure}[t] \centering \includegraphics[width=0.66\linewidth]{figures/F4_F5_parity_diagnostics.pdf} \caption{Parity diagnostics for transport residuals and conditioning across evaluated groups. The left panel summarizes residual-ratio behavior and the right panel relates conditioning to performance deltas, showing that numerically unstable regions coincide with noisier estimates and motivating conservative gating before any advantage statement.} \label{fig:parity} \end{figure} \begin{table}[t] \caption{Parity diagnostics aggregated by dataset and baseline. All rows have matched feature rank, while residual and conditioning statistics quantify deviation from exact linear transport assumptions used in Theorems~\ref{thm:datafit} and~\ref{thm:ridge}.} \label{tab:parity} \centering \small \renewcommand{\arraystretch}{1.1} \setlength{\tabcolsep}{4pt} \begin{tabular}{lcccc} \hline Dataset & Rank($Z_q$)=Rank($Z_c$) & Residual ratio range & $\kappa(\widehat{T})$ range & Theorem-gate rate \\ \hline MNIST & yes & $[0.796,\,0.823]$ & $[1124,\,1665]$ & 0.00 \\ Fashion-MNIST & yes & $[0.808,\,0.835]$ & $[691,\,2474]$ & 0.00 \\ CIFAR10-gray & yes & $[0.757,\,0.799]$ & $[1410,\,2298]$ & 0.00 \\ \hline \end{tabular} \end{table} Symbolic verification further supports the formal layer: transport identity, data-fit equivalence substitution, orthogonal special case, and residual-bound checks all evaluate to true in the validation report. This alignment closes the theorem-to-computation trace required for guarded inference. \subsection{Certified Acceptance and Policy Stability} \Figref{fig:certified} visualizes certification rates and policy sweeps. 
Within the tested grid, certified volumes are invariant for each dataset, so qualitative ranking does not depend on small changes in $\tau_{\mathrm{eff}}$ or $\alpha_{\mathrm{FWER}}$. \begin{figure}[t] \centering \includegraphics[width=0.66\linewidth]{figures/F6_F7_certified_sensitivity.pdf} \caption{Certification outcomes under default and sensitivity settings. The left panel reports certified versus uncertified region behavior by dataset and the right panel sweeps threshold/error-budget settings, showing stable certified volumes across the evaluated policy grid and preserving the same cross-dataset ordering.} \label{fig:certified} \end{figure} Table~\ref{tab:topk} lists representative top certified regimes by one-sided lower confidence bound. The highest entries are concentrated on Fashion-MNIST and CIFAR10-gray, with positive margins that remain above threshold after correction, matching the dataset-level certification summary. \begin{table}[t] \caption{Representative top configurations ranked by one-sided lower confidence bound after multiplicity correction. Certified status is determined by the conjunction of parity-gate rejection and thresholded lower bound.} \label{tab:topk} \centering \small \renewcommand{\arraystretch}{1.1} \setlength{\tabcolsep}{4pt} \begin{tabular}{lccccc} \hline Dataset & $(X,t,N_q,M)$ & $\widehat{\Delta}$ & LCB & Holm reject & Certified \\ \hline CIFAR10-gray & $(32,1.0,6,16)$ & 0.536 & 0.513 & yes & yes \\ Fashion-MNIST & $(32,1.0,6,16)$ & 0.473 & 0.448 & yes & yes \\ CIFAR10-gray & $(64,2.0,8,24)$ & 0.344 & 0.319 & yes & yes \\ Fashion-MNIST & $(64,2.0,8,24)$ & 0.330 & 0.297 & yes & yes \\ Fashion-MNIST & $(48,0.2,6,24)$ & 0.306 & 0.288 & yes & yes \\ \hline \end{tabular} \end{table} \section{Discussion, Limitations, and Future Work} \subsection{Limitations} Three caveats matter for interpretation. 
First, certification boundaries are policy-dependent, even though the tested grid ($\tau_{\mathrm{eff}}\in\{0,0.005,0.01,0.02\}$, $\alpha_{\mathrm{FWER}}\in\{0.05,0.10\}$) is locally stable and preserves the same certified volumes (Fashion-MNIST: 9, CIFAR10-gray: 10, MNIST: 0).
Second, mechanism controls are supportive diagnostics rather than primary acceptance criteria, because the principal claims are anchored to parity-gated, confidence-certified regime evidence.
Third, part of the cited 2025--2026 evidence base remains preprint-stage, and one architecture reference is only partially extracted, so external confidence should be maturity-weighted.
\subsection{Future Work}
The immediate next step is to expand benchmark breadth beyond grayscale image classification while preserving theorem-gated certification.
A second step is to deepen mechanism analysis by connecting strategy-level alignment differences to class-conditional error structure, rather than relying on aggregate indicators alone.
Finally, independent replication on additional hardware-aware simulators and finalized peer-reviewed updates will improve the external validity of conditional-advantage boundaries.
\section{Conclusion}
This work reframes quantum-advantage assessment in QRC from point-score comparison to certified regime mapping under explicit formal and statistical controls.
By combining \eqref{eq:delta} and \eqref{eq:regime} with theorem-gated parity filtering and LCB-based multiplicity-aware certification, the analysis supports a clear conclusion: advantage is conditional, dataset-dependent, and bounded in configuration space.
Harder datasets retain non-empty certified regions under fixed policy, while MNIST remains uncertified, consistent with saturation concerns after PCA preprocessing.
The proposed framework is intentionally conservative, but it yields auditable and reproducible claims that can be transferred to broader QRC evaluations.
\bibliographystyle{conference}
\bibliography{references}
\appendix
\section{Implementation and Reproducibility Details}
All experiments were executed under a CPU-only budget on Apple Silicon with fixed split manifests and matched preprocessing/readout parity across baselines.
Repeated-seed evaluation used multi-seed aggregation, one-sided confidence bounds, and Holm familywise correction over the evaluated configuration family.
Hyperparameter sweeps covered PCA components, evolution time, qubit count, observable budget, and ridge regularization; policy sweeps covered $\tau_{\mathrm{eff}}\in\{0.0,0.005,0.01,0.02\}$ and $\alpha_{\mathrm{FWER}}\in\{0.05,0.10\}$.
Proof reproducibility was enforced by symbolic checks of transport identities and orthogonal special-case reduction, combined with numeric sanity checks for residual-bound inequalities.
Reported uncertainty reflects seed-level variability and policy-level sensitivity where applicable.
\section{Policy Sensitivity Table}
Table~\ref{tab:sensitivity} shows that certified volumes remain unchanged in the tested policy grid.
This supports robustness of qualitative conclusions within the explored threshold range, but it does not rule out sensitivity outside that range.
\begin{table}[h]
\caption{Threshold-policy sensitivity for certified region volume. Each row reports the certified volume at a fixed effect threshold; both familywise error levels ($\alpha_{\mathrm{FWER}}=0.05$ and $0.10$) yield identical volumes and are therefore shown jointly in one column.}
\label{tab:sensitivity}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{cccc}
\hline
$\tau_{\mathrm{eff}}$ & $\alpha_{\mathrm{FWER}}$ & Fashion-MNIST certified & CIFAR10-gray certified \\
\hline
0.000 & 0.05/0.10 & 9 & 10 \\
0.005 & 0.05/0.10 & 9 & 10 \\
0.010 & 0.05/0.10 & 9 & 10 \\
0.020 & 0.05/0.10 & 9 & 10 \\
\hline
\end{tabular}
\end{table}
\section{Complementary Mechanism Controls}
Mechanism controls were restricted to the harder datasets and are presented as complementary evidence only.
\Figref{fig:mechanism} and Table~\ref{tab:mechanism} show consistent ordering: kernel-aligned measurement strategies achieve higher alignment and larger cost-normalized gains than random subsets, while reduced-entangling controls remain intermediate. \begin{figure}[h] \centering \includegraphics[width=0.66\linewidth]{figures/F8_mechanism_controls.pdf} \caption{Mechanism-control diagnostics on harder datasets only. The two panels compare cost-normalized gain and kernel-target alignment across control strategies, showing discriminative (non-saturated) alignment behavior and supporting mechanism interpretations as secondary evidence rather than standalone acceptance criteria.} \label{fig:mechanism} \end{figure} \begin{table}[h] \caption{Mechanism-control summary over harder datasets. Values are means across top-$k$ candidate regimes with uncertainty from repeated perturbation settings.} \label{tab:mechanism} \centering \small \renewcommand{\arraystretch}{1.1} \setlength{\tabcolsep}{4pt} \begin{tabular}{lcc} \hline Strategy & Alignment mean range & Cost-normalized gain range \\ \hline Kernel-aligned greedy & $[0.632,\,0.645]$ & $[0.364,\,0.386]$ \\ Full entangling & $[0.629,\,0.629]$ & $[0.337,\,0.357]$ \\ Reduced entangling & $[0.578,\,0.586]$ & $[0.190,\,0.209]$ \\ Random subset & $[0.578,\,0.579]$ & $[0.175,\,0.196]$ \\ \hline \end{tabular} \end{table} \end{document}