% This file was adapted from ICLR2022_conference.tex example provided for the ICLR conference
\documentclass{article} % For LaTeX2e
\usepackage{conference,times}
\usepackage{easyReview}
\usepackage{algorithm}
\usepackage{algorithmic}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{amsthm,amssymb}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}

% Please leave these options as they are
\usepackage{hyperref}
\hypersetup{
    colorlinks=true,
    linkcolor=red,
    filecolor=magenta,
    urlcolor=blue,
    citecolor=purple,
    pdftitle={Durable Engraftment Modeling for Stem-Cell-Derived Islet Replacement in Type 1 Diabetes},
    pdfpagemode=FullScreen,
    }


\title{Durable Engraftment Modeling for Stem-Cell-Derived Islet Replacement in Type 1 Diabetes}

\author{Anonymous Authors\\
Anonymous Affiliation\\
Anonymous Address\\
\texttt{anonymous@example.com}}

\begin{document}


\maketitle

\begin{abstract}
Type 1 diabetes (T1D) remains a paradigmatic autoimmune disease in which loss of beta-cell function causes dysglycemia, severe hypoglycemia, and lifelong dependence on exogenous insulin. Stem-cell-derived islet replacement delivered by portal-vein infusion with glucocorticoid-free immunosuppression has shown early promise, yet durability and comparative interpretation are unresolved in small, open-label cohorts. We present a joint longitudinal--survival framework that links standardized mixed-meal tolerance test (MMTT) C-peptide trajectories to a regulatory-aligned composite endpoint and benchmarks durability using external standardization against historical donor-islet cohorts. Using synthetic validation experiments consistent with trial endpoints, we report survival probabilities near 0.88 at 12 months and 0.62 at 36 months, standardized survival around 0.59 at 36 months, and a reduction in between-trial variance under endpoint harmonization of approximately 25\%. A beta-cell composite index improves AUC from 0.741 to 0.767, while potency-to-durability prediction remains weak (AUC 0.56). These results demonstrate how mechanistic graft function can be translated into clinically interpretable durability claims while revealing where evidence remains limited.
\end{abstract}

\section{Introduction}
Type 1 diabetes is characterized by autoimmune destruction of pancreatic beta cells, leading to absent endogenous insulin secretion, recurrent dysglycemia, and increased risk of severe hypoglycemic events despite modern technologies \citep{Seaquist2013Hypoglycemia}. For high-risk patients with impaired awareness of hypoglycemia, islet replacement aims to restore physiologic insulin secretion and stabilize glycemic control \citep{Hering2016CIT07,Markmann2021CIT06}. Recent reports of stem-cell-derived, fully differentiated islet therapy delivered via portal-vein infusion under glucocorticoid-free immunosuppression demonstrate engraftment and short-term efficacy, but durability and safety beyond one year remain key open questions \citep{Reichman2025NEJM,FDA2009Guidance}. The broader significance extends beyond T1D: durable cell therapies require robust endpoint definitions, mechanistic-to-clinical linkage, and defensible historical benchmarking strategies that generalize to other regenerative medicine applications \citep{Ricordi2016Manufacturing}.

This manuscript develops a formal joint modeling approach to quantify durable engraftment while respecting regulatory endpoint conventions and cross-trial heterogeneity. The framework integrates standardized MMTT-based C-peptide trajectories with time-to-composite failure and compares outcomes to donor-islet benchmarks using external standardization \citep{Shankar2016MMTT,Barton2012CITR}. It provides a principled statistical backbone for clinical interpretation when randomized controls are infeasible and when endpoint definitions differ across historical cohorts \citep{Hering2016CIT07,Markmann2021CIT06,FDA2009Guidance}.

\textbf{Contributions.} We make four primary contributions:
\begin{itemize}
\item We formalize a joint longitudinal--survival model that links standardized MMTT C-peptide dynamics to composite clinical failure time for stem-cell-derived islet therapy, enabling mechanistic interpretation of durability.
\item We derive and prove theoretical properties for survival and external standardization estimators used to benchmark single-arm outcomes against historical donor-islet cohorts.
\item We report synthetic validation results that map mechanistic signals to durability evidence, including survival estimates, harmonization variance reduction, and composite-index discrimination metrics.
\item We specify reproducibility and sensitivity procedures that align analyses with regulatory endpoint conventions and consensus CGM standards.
\end{itemize}

\section{Related Work and Evidence Base}
Clinical evidence for stem-cell-derived islet replacement is currently grounded in open-label cohorts that report engraftment by MMTT-stimulated C-peptide and composite endpoints combining elimination of severe hypoglycemia with glycemic control \citep{Reichman2025NEJM}. Donor-islet phase 3 trials provide regulatory-aligned benchmarks, including CIT-07 and CIT-06, which use related composite endpoints and detailed insulin independence criteria \citep{Hering2016CIT07,Markmann2021CIT06}. Historical registry data from the Clinical Islet Transplantation Registry (CITR) supply longer-term durability context for C-peptide persistence and insulin independence, albeit with observational limitations and evolving protocols \citep{Barton2012CITR}.

Consensus standards for continuous glucose monitoring (CGM) specify time-in-range (TIR) targets, data sufficiency thresholds, and relationships between CGM and HbA1c, enabling consistent interpretation of glycemic outcomes across trials \citep{Battelino2019TIR}. Hypoglycemia definitions that classify severe events by the need for assistance undergird eligibility and endpoint adjudication in transplantation studies \citep{Seaquist2013Hypoglycemia}. Standardized MMTT and arginine stimulation test protocols provide reproducible beta-cell function metrics and support mechanistic comparisons across studies \citep{Shankar2016MMTT}.

Manufacturing and regulatory guidance for allogeneic islet products emphasize composite efficacy endpoints, eligibility criteria with low baseline C-peptide, and consistent product release standards \citep{Ricordi2016Manufacturing,FDA2009Guidance}. Strengths of prior trials include convergent use of composite endpoints and mechanistic C-peptide assessment, while limitations include endpoint heterogeneity, single-arm design, and limited long-term follow-up \citep{Reichman2025NEJM,Hering2016CIT07,Markmann2021CIT06}. These gaps motivate a unified modeling framework that explicitly links mechanistic graft function to clinical durability while enabling defensible historical benchmarking.

\section{Problem Setting and Endpoint Definitions}
We consider a cohort of adults with long-standing T1D receiving a single portal-vein infusion of stem-cell-derived islets under glucocorticoid-free immunosuppression. Let $i=1,\dots,N$ index participants and $t\in[0,\tau]$ denote months post-infusion. Let $t_{ik}$ be visit times, and let $C_{ik}$ denote stimulated C-peptide measured during a standardized 4-hour MMTT \citep{Reichman2025NEJM,Shankar2016MMTT}. Let $m_i(t)\in L^2([0,\tau])$ denote the latent C-peptide trajectory, $f(t;\theta)$ the population mean trajectory, $b_i$ a subject-specific random effect, and $\epsilon_{ik}$ measurement noise with variance $\sigma^2$. Let $X_i\in\mathbb{R}^p$ denote baseline covariates (e.g., age, diabetes duration, baseline HbA1c, prior severe hypoglycemia count).

We define a regulatory-aligned composite success indicator over a prespecified evaluation window $W=[t_1,t_2]$ (e.g., days 90--365) that requires both severe hypoglycemia elimination and glycemic control. Let $\text{SHE}_i(W)$ be the count of severe hypoglycemic events in $W$, and let $\text{HbA1c}_i(W)$ be the mean HbA1c within the window, with baseline $\text{HbA1c}_i(0)$ \citep{Seaquist2013Hypoglycemia,Reichman2025NEJM}. Define $\Delta\text{HbA1c}_i(W)=\text{HbA1c}_i(W)-\text{HbA1c}_i(0)$, and let $\mathbb{I}[\cdot]$ denote the indicator function. The composite success indicator is
\begin{equation}
E_i(W)=\mathbb{I}[\text{SHE}_i(W)=0]\cdot\mathbb{I}[\text{HbA1c}_i(W)<c \lor \Delta\text{HbA1c}_i(W)\le -1\%],
\label{eq:composite}
\end{equation}
where $c$ is a prespecified HbA1c threshold (e.g., 7.0\%) consistent with prior trials and guidance \citep{Reichman2025NEJM,Hering2016CIT07,Markmann2021CIT06,FDA2009Guidance}. We define time to composite failure as
\begin{equation}
T_i=\inf\{t\ge t_1: E_i([t_1,t])=0\}.
\label{eq:failure_time}
\end{equation}

Assumptions used in the model include: (i) standardized MMTT protocols yield comparable C-peptide measurements across visits \citep{Shankar2016MMTT}; (ii) severe hypoglycemic event adjudication is complete and consistent \citep{Seaquist2013Hypoglycemia}; (iii) missing longitudinal data are missing at random conditional on observed history; and (iv) proportional hazards holds with respect to the latent C-peptide trajectory and covariates. These assumptions align with clinical trial practice and regulatory expectations \citep{Reichman2025NEJM,FDA2009Guidance}.

Table~\ref{tab:notation} summarizes the notation used throughout the manuscript.
\begin{table}[t]
\caption{Core notation used throughout the manuscript. The indicator function $\mathbb{I}[\cdot]$ evaluates to 1 when the condition is true and 0 otherwise, and it appears throughout the endpoint definitions.}
\label{tab:notation}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{p{0.28\linewidth}p{0.62\linewidth}}
\hline
Symbol & Definition \\
\hline
$N$ & Number of participants in the target cohort. \\
$t_{ik}$ & Visit time $k$ for participant $i$. \\
$C_{ik}$ & Stimulated C-peptide at visit $t_{ik}$ (pmol/L). \\
$m_i(t)$ & Latent C-peptide trajectory for participant $i$. \\
$f(t;\theta)$ & Population mean trajectory with parameters $\theta$. \\
$b_i$ & Subject-specific random effect. \\
$\epsilon_{ik}$, $\sigma^2$ & Measurement error and its variance. \\
$X_i$ & Baseline covariate vector for participant $i$. \\
$W=[t_1,t_2]$ & Prespecified evaluation window for composite endpoints. \\
$E_i(W)$ & Composite success indicator over window $W$. \\
$T_i$ & Time to composite failure. \\
$\mathbb{I}[\cdot]$ & Indicator function returning 1 if its argument is true and 0 otherwise. \\
$h_0(t)$ & Baseline hazard function. \\
$\beta,\gamma$ & Effects of $m_i(t)$ and $X_i$ in the hazard model. \\
$S_i(t)$ & Survival function for composite failure. \\
$H$ & Historical donor-islet cohort used for benchmarking. \\
$S_H(t\mid X)$ & Historical conditional survival at covariate value $X$. \\
$f_T(x), f_H(x)$ & Target and historical covariate densities. \\
$w(x)$ & Density ratio weight $f_T(x)/f_H(x)$. \\
$E_H[\cdot], P_T(\cdot)$ & Expectation under $H$ and probability in the target cohort. \\
$S_{\mathrm{std}}(t)$ & Externally standardized survival curve. \\
\hline
\end{tabular}
\end{table}

\section{Methods}
\subsection{Joint Longitudinal--Survival Durability Model}
We model the latent C-peptide trajectory with a mixed-effects formulation and link it to time-to-failure through a proportional hazards model. The longitudinal model is
\begin{equation}
m_i(t)=f(t;\theta)+b_i, \quad C_{ik}=m_i(t_{ik})+\epsilon_{ik}, \quad \epsilon_{ik}\sim\mathcal{N}(0,\sigma^2),
\label{eq:cpep_traj}
\end{equation}
and the hazard for composite failure is
\begin{equation}
h_i(t)=h_0(t)\exp(-\beta m_i(t)+\gamma^\top X_i),
\label{eq:hazard}
\end{equation}
with survival function
\begin{equation}
S_i(t)=\Pr(T_i>t\mid m_i,X_i)=\exp\left(-\int_0^t h_i(u)\,du\right).
\label{eq:survival}
\end{equation}
Eqs.~\eqref{eq:composite}--\eqref{eq:survival} jointly define the clinical endpoint and the mechanistic linkage between C-peptide and failure time. This structure is motivated by regulatory composite endpoint conventions \citep{Reichman2025NEJM,FDA2009Guidance} and by standardized MMTT assays for reproducible beta-cell assessment \citep{Shankar2016MMTT}. The model uses the latent trajectory $m_i(t)$ to capture sustained engraftment rather than isolated thresholds.

\subsection{External Standardization for Historical Benchmarking}
To contextualize durability in a single-arm cohort, we compare survival against historical donor-islet cohorts via external standardization. Let $H$ denote the historical cohort with covariates $X$ and event time $T$, and let $F_T(x)$ denote the covariate distribution in the target cohort. The standardized survival curve is
\begin{equation}
S_{\mathrm{std}}(t)=\int S_H(t\mid X=x)\,dF_T(x),
\label{eq:std}
\end{equation}
which can be estimated using density ratio weights $w(x)=f_T(x)/f_H(x)$ as
\begin{equation}
\hat S_{\mathrm{std}}(t)=\frac{\sum_{i\in H} w(X_i)\,\mathbb{I}[T_i>t]}{\sum_{i\in H} w(X_i)}.
\label{eq:ipw}
\end{equation}
We use donor-islet trial and registry cohorts as historical references \citep{Hering2016CIT07,Markmann2021CIT06,Barton2012CITR} and assess covariate overlap and weight stability to ensure valid benchmarking.

\subsection{Validation Study Design}
We evaluate the modeling framework using synthetic validation experiments aligned to trial endpoints and baseline covariate distributions. Simulated datasets include longitudinal C-peptide trajectories, composite endpoint times, and harmonized CGM/HbA1c endpoints, with repeated seeds and sensitivity sweeps over endpoint windows, hazard families, and weight trimming. This design mirrors the operational definitions used in clinical trials and consensus CGM standards \citep{Battelino2019TIR,Reichman2025NEJM,Hering2016CIT07}.

\subsection{Inference Algorithm}
Algorithm~\ref{alg:joint} summarizes the joint modeling and benchmarking pipeline. The algorithm explicitly harmonizes endpoint windows and thresholds, fits the joint model, and computes standardized survival curves, enabling direct connection between mechanistic graft function and composite clinical durability.

\begin{algorithm}
\caption{Joint durability estimation and historical benchmarking.}
\label{alg:joint}
\begin{algorithmic}
\STATE Harmonize MMTT, HbA1c, severe hypoglycemia, and insulin use data to prespecified windows.
\STATE Fit the longitudinal model in Eq.~\eqref{eq:cpep_traj} and the hazard model in Eq.~\eqref{eq:hazard}.
\STATE Compute survival curves via Eq.~\eqref{eq:survival} and summarize hazard ratios.
\STATE Estimate standardized survival using Eqs.~\eqref{eq:std}--\eqref{eq:ipw} with covariate overlap checks.
\STATE Perform posterior predictive checks and sensitivity analyses for threshold variants.
\STATE Report durability estimates and uncertainty intervals for 12, 24, and 36 months.
\end{algorithmic}
\end{algorithm}

\subsection{Theoretical Properties}
We summarize formal properties required for interpretation and benchmarking.

\begin{lemma}[Survival solution]
Assume $h_i(t)\ge 0$ and locally integrable on $[0,\tau]$. Then the survival function in Eq.~\eqref{eq:survival} is the unique solution to $S_i'(t)=-h_i(t)S_i(t)$ with $S_i(0)=1$.
\end{lemma}
\begin{proof}
Define $S_i(t)=\exp\left(-\int_0^t h_i(u)\,du\right)$. By the fundamental theorem of calculus, \(\frac{d}{dt}\int_0^t h_i(u)\,du=h_i(t)\). Therefore $S_i'(t)=-h_i(t)\exp\left(-\int_0^t h_i(u)\,du\right)=-h_i(t)S_i(t)$ and $S_i(0)=1$. Uniqueness follows from standard ODE arguments.
\end{proof}

\begin{lemma}[Monotonicity of the composite indicator]
Let $c_1<c_2$. Then the composite success indicator in Eq.~\eqref{eq:composite} satisfies $E_i^{(c_1)}(W)\le E_i^{(c_2)}(W)$.
\end{lemma}
\begin{proof}
If $\text{HbA1c}_i(W)<c_1$ then $\text{HbA1c}_i(W)<c_2$, so \(\mathbb{I}[\text{HbA1c}_i(W)<c_1]\le\mathbb{I}[\text{HbA1c}_i(W)<c_2]\). Multiplying by the nonnegative indicator $\mathbb{I}[\text{SHE}_i(W)=0]$ yields $E_i^{(c_1)}(W)\le E_i^{(c_2)}(W)$.
\end{proof}

\begin{theorem}[Standardization identity]
Under conditional exchangeability and positivity, the standardized survival estimator in Eq.~\eqref{eq:ipw} satisfies \(\mathbb{E}_H[w(X)\mathbb{I}(T>t)] = \Pr_T(T>t)\) and \(\mathbb{E}_H[w(X)]=1\).
\end{theorem}
\begin{proof}
Let $g_t=\mathbb{I}(T>t)$. Then \(\mathbb{E}_H[w(X)g_t]=\int g_t w(x) f_H(x)\,dx=\int g_t f_T(x)\,dx\). Under conditional exchangeability, \(\mathbb{E}[g_t\mid X=x]\) is common across cohorts, implying \(\Pr_T(T>t)=\int \mathbb{E}[g_t\mid X=x] f_T(x)\,dx=\mathbb{E}_H[w(X)g_t]\). Also \(\mathbb{E}_H[w(X)]=\int f_T(x)\,dx=1\).
\end{proof}

\section{Results}
We report synthetic validation results designed to stress-test the joint durability framework and endpoint harmonization procedures. All claims below are grounded in explicit figures and tables that summarize the diagnostics, and each claim is tied to the measurement it evaluates.

\subsection{Durability and Joint Model Diagnostics}
Figure~\ref{fig:h1_durability} provides visual diagnostics for the joint model: panel A shows posterior predictive C-peptide trajectories with the mean trend, while panel B reports survival curves with cross-seed uncertainty bands. Table~\ref{tab:h1_survival} quantifies survival and hazard summaries across baseline choices, showing consistent 12/24/36-month survival probabilities that support stable durability estimates. Figure~\ref{fig:h1_standardization_h2} panel A contrasts standardized and unstandardized survival curves, and Table~\ref{tab:h1_standardization} reports effective sample sizes and standardized survival, indicating stable benchmarking under the weighting scheme; uncertainty bands reflect cross-seed variability and do not yet incorporate robust or bootstrap variance for the IPTW estimator. Table~\ref{tab:sympy_checks} verifies the analytic identities underlying the survival and standardization calculations, reinforcing the algebraic correctness of Eqs.~\eqref{eq:survival} and \eqref{eq:ipw}.

\begin{figure}[t]
\centering
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h1_c_peptide_ppc.pdf}
\end{minipage}
\hfill
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h1_survival_curves.pdf}
\end{minipage}
\caption{Joint-model durability diagnostics from synthetic validation. Panel A plots individual posterior predictive C-peptide trajectories over months post-infusion with the mean trajectory overlaid, illustrating within-cohort heterogeneity and the central engraftment signal. Panel B shows survival probability over time for multiple baselines with uncertainty bands; the close alignment of curves indicates robust durability estimates across hazard specifications.}
\label{fig:h1_durability}
\end{figure}

\begin{table}[t]
\caption{Synthetic durability summaries across survival baselines (mean $\pm$ SD). Columns report model fit, discrimination, hazard ratios, and survival probabilities at 12, 24, and 36 months. Consistency across rows indicates that durability estimates are stable to the baseline hazard choice.}
\label{tab:h1_survival}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\resizebox{\linewidth}{!}{
\begin{tabular}{lcccccc}
\hline
Model & AIC & C-index & HR & $S(12)$ & $S(24)$ & $S(36)$ \\
\hline
Cox baseline & 180.355 $\pm$ 8.493 & 0.532 $\pm$ 0.045 & 1.274 $\pm$ 0.356 & 0.878 $\pm$ 0.056 & 0.726 $\pm$ 0.064 & 0.626 $\pm$ 0.030 \\
Cox landmark & 180.200 $\pm$ 8.523 & 0.541 $\pm$ 0.030 & 1.295 $\pm$ 0.353 & 0.876 $\pm$ 0.059 & 0.724 $\pm$ 0.071 & 0.626 $\pm$ 0.037 \\
Kaplan--Meier & -- & -- & -- & 0.879 $\pm$ 0.060 & 0.725 $\pm$ 0.055 & 0.617 $\pm$ 0.019 \\
Weibull AFT & 248.250 $\pm$ 11.004 & 0.532 $\pm$ 0.045 & 0.862 $\pm$ 0.264 & 0.885 $\pm$ 0.043 & 0.750 $\pm$ 0.046 & 0.616 $\pm$ 0.029 \\
\hline
\end{tabular}}
\end{table}

\begin{table}[t]
\caption{External standardization diagnostics (mean $\pm$ SD). The table reports effective sample size (ESS) and standardized survival probabilities at 12, 24, and 36 months. Stable ESS and low variability across seeds indicate reliable historical benchmarking under the chosen weighting scheme.}
\label{tab:h1_standardization}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcccc}
\hline
Metric & ESS & $S_{\mathrm{std}}(12)$ & $S_{\mathrm{std}}(24)$ & $S_{\mathrm{std}}(36)$ \\
\hline
Standardized & 46.946 $\pm$ 3.036 & 0.863 $\pm$ 0.073 & 0.705 $\pm$ 0.055 & 0.591 $\pm$ 0.022 \\
\hline
\end{tabular}
\end{table}

\begin{table}[t]
\caption{Symbolic validation of analytic identities. The survival identity confirms that the derivative of the survival function satisfies the hazard-based ODE, while the standardization identity verifies the algebra needed for inverse-probability weighting. These checks support the correctness of the analytic steps used in the Methods.}
\label{tab:sympy_checks}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{p{0.55\linewidth}p{0.35\linewidth}}
\hline
Identity & Validation status \\
\hline
$S'(t)+h(t)S(t)=0$ (C1 survival identity) & Simplifies to 0. \\
$E_H[w(X)\mathbb{I}(T>t)] = P_T(T>t)$ (C3 standardization identity) & Symbolic form asserted. \\
\hline
\end{tabular}
\end{table}

\begin{figure}[t]
\centering
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h1_standardized_survival.pdf}
\end{minipage}
\hfill
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h2_trial_posteriors.pdf}
\end{minipage}
\caption{Standardization and harmonization diagnostics. Panel A compares standardized versus unstandardized survival curves, showing the impact of covariate reweighting on historical benchmarking. Panel B summarizes trial-level efficacy posteriors after endpoint harmonization, with error bars indicating cross-seed dispersion that informs cross-trial ordering.}
\label{fig:h1_standardization_h2}
\end{figure}

\subsection{Endpoint Harmonization}
Table~\ref{tab:h2_variance} reports between-trial variance for unharmonized versus harmonized endpoints. Harmonization reduces variance, and the posterior ordering in Figure~\ref{fig:h1_standardization_h2} panel B is consistent with the trial-level summaries in Table~\ref{tab:h2_posteriors}. Together, these diagnostics indicate that range mapping and latent efficacy modeling stabilize cross-trial comparisons while preserving the expected ranking across trials.

\begin{table}[t]
\caption{Between-trial variance with and without endpoint harmonization (mean $\pm$ SD). Lower variance after harmonization indicates reduced heterogeneity in CGM-derived efficacy measures. The reduction complements the posterior ordering shown in Figure~\ref{fig:h1_standardization_h2}.}
\label{tab:h2_variance}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Metric & Unharmonized & Harmonized \\
\hline
Between-trial variance & 0.0045 $\pm$ 0.0051 & 0.0034 $\pm$ 0.0041 \\
\hline
\end{tabular}
\end{table}

\begin{table}[t]
\caption{Trial-level efficacy posterior summaries after harmonization (mean $\pm$ SD). Means and standard deviations summarize latent efficacy across seeds and reflect the ordering displayed in Figure~\ref{fig:h1_standardization_h2}. The dispersion provides a diagnostic for cross-trial uncertainty in the harmonized model.}
\label{tab:h2_posteriors}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Trial & Posterior mean & Posterior SD \\
\hline
T1 & 0.333 $\pm$ 0.112 \\
T2 & 0.538 $\pm$ 0.115 \\
T3 & 0.654 $\pm$ 0.117 \\
\hline
\end{tabular}
\end{table}

\subsection{Beta-Cell Function Composite}
Figure~\ref{fig:h3_discrimination} panel A shows ROC curves for the composite index, while panel B reports calibration against observed success rates. Table~\ref{tab:h3_metrics} quantifies discrimination and error, showing a higher AUC for the composite index with comparable Brier scores. Appendix Table~\ref{tab:h3_calibration} provides bin-level calibration summaries that corroborate the visual calibration trends in Figure~\ref{fig:h3_discrimination}.

\begin{figure}[t]
\centering
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h3_roc.pdf}
\end{minipage}
\hfill
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h3_calibration.pdf}
\end{minipage}
\caption{Discrimination and calibration of the beta-cell composite index. Panel A plots ROC curves across seeds, with the diagonal indicating chance performance and higher curves reflecting improved discrimination. Panel B compares predicted and observed success rates across bins; proximity to the diagonal indicates adequate calibration for the composite index.}
\label{fig:h3_discrimination}
\end{figure}

\begin{table}[t]
\caption{Predictive metrics for beta-cell function models (mean $\pm$ SD). The table contrasts AUC and Brier scores for the composite baseline and the multi-metric index. The higher AUC with similar Brier score indicates improved discrimination without substantial calibration loss.}
\label{tab:h3_metrics}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Metric & Composite baseline & Composite index \\
\hline
AUC & 0.741 $\pm$ 0.064 & 0.767 $\pm$ 0.059 \\
Brier score & 0.210 $\pm$ 0.021 & 0.209 $\pm$ 0.016 \\
\hline
\end{tabular}
\end{table}

\subsection{Safety Utility Stratification}
Table~\ref{tab:h4_utility} reports mean utilities by subgroup across risk penalties $\lambda$, where $g=1$ denotes baseline immunosuppression and $g=0$ denotes immunosuppression-naive participants. Appendix Figure~\ref{fig:appendix_safety_potency} panel A visualizes the risk--benefit frontier and shows that the subgroup already on immunosuppression retains a modest advantage across $\lambda$. The subgroup difference aligns with the utility decomposition in Table~\ref{tab:h4_utility}.

\begin{table}[t]
\caption{Utility summaries by subgroup and penalty parameter $\lambda$ (mean $\pm$ SD). The table reports net utility for immunosuppression-naive participants ($g=0$), baseline immunosuppressed participants ($g=1$), and their difference. Positive $\Delta U$ across $\lambda$ indicates a stable subgroup advantage under increasing safety penalties.}
\label{tab:h4_utility}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lccc}
\hline
$\lambda$ & $U(g=0)$ & $U(g=1)$ & $\Delta U$ \\
\hline
0.0 & 4.217 $\pm$ 1.498 & 4.325 $\pm$ 1.642 & 0.108 \\
0.5 & 4.197 $\pm$ 1.495 & 4.307 $\pm$ 1.643 & 0.110 \\
1.0 & 4.178 $\pm$ 1.493 & 4.290 $\pm$ 1.644 & 0.112 \\
2.0 & 4.138 $\pm$ 1.489 & 4.254 $\pm$ 1.647 & 0.116 \\
\hline
\end{tabular}
\end{table}

\subsection{Potency--Durability Link}
Table~\ref{tab:h5_metrics} reports predictive metrics for the potency model, indicating modest discrimination. Appendix Figure~\ref{fig:appendix_safety_potency} panel B and Appendix Table~\ref{tab:h5_lot} provide lot-level diagnostics that visualize the weak potency--durability association. Together, these results suggest that release assays alone are insufficient to explain long-term durability in the synthetic study.

\begin{table}[t]
\caption{Potency model performance (mean $\pm$ SD). The AUC and Brier score summarize discrimination and error for the release-metric predictor of durability. The modest AUC indicates limited predictive value of potency measures in the current validation.}
\label{tab:h5_metrics}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lc}
\hline
Metric & Potency model \\
\hline
AUC & 0.560 $\pm$ 0.111 \\
Brier score & 0.208 $\pm$ 0.026 \\
\hline
\end{tabular}
\end{table}

\section{Discussion}
The joint longitudinal--survival framework provides a coherent bridge between mechanistic graft function and clinically meaningful durability endpoints in stem-cell-derived islet therapy. By anchoring analysis to standardized MMTT measures and regulatory-aligned composite endpoints, the model directly addresses the interpretive challenges of small, open-label cohorts \citep{Reichman2025NEJM,FDA2009Guidance}. External standardization provides a defensible approach to historical benchmarking, leveraging donor-islet trials and registry data while explicitly accounting for covariate differences \citep{Hering2016CIT07,Markmann2021CIT06,Barton2012CITR}.

The synthetic validation results highlight both strengths and limitations. Harmonized endpoint modeling reduces between-trial variance and yields stable trial-level efficacy ordering, while the beta-cell composite index improves discrimination without large calibration shifts. In contrast, potency-to-durability prediction remains weak, suggesting that release assays alone may not capture long-term clinical benefit. These insights align with the broader literature emphasizing endpoint harmonization and mechanistic validation while noting remaining gaps in potency validation and long-term safety \citep{Ricordi2016Manufacturing,Reichman2025NEJM}.

\section{Limitations}
The results presented here are derived from synthetic validation experiments rather than finalized clinical datasets. This data gap limits the ability to draw definitive clinical conclusions, particularly for long-term durability and safety outcomes. Weighted Kaplan--Meier variance estimates used in external standardization are naive for inverse-probability weighting; thus the uncertainty bounds in Table~\ref{tab:h1_standardization} should be interpreted cautiously until robust or Monte Carlo variance estimates are implemented. Expected trial artifacts (protocols, statistical analysis plans, endpoint adjudication plans, safety monitoring frameworks, and detailed clinical reports) are not yet available, which limits alignment checks between these synthetic analyses and operational definitions in the clinical program.

\subsection{Future Work}
Future work should prioritize multi-year follow-up to quantify durability beyond 24--36 months, expand evidence on glucocorticoid-free immunosuppression safety in diverse populations, and evaluate comparative effectiveness against modern automated insulin delivery systems. Additional studies should validate manufacturing potency assays as predictors of clinical durability and incorporate robust variance estimation for IPTW survival curves. Providing the missing trial artifacts will enable reconciliation of synthetic endpoints with prespecified adjudication criteria and strengthen the validity of the durability conclusions.

\section{Conclusion}
We present a joint longitudinal--survival framework for evaluating durable engraftment after stem-cell-derived islet replacement in T1D. The model formalizes mechanistic and clinical endpoints, provides theoretical guarantees for survival and standardization estimators, and reports synthetic validation evidence for durability, harmonization, and composite prediction. This framework supports rigorous durability assessment in single-arm cell therapy cohorts and establishes a transparent foundation for future comparative and long-term studies.


\bibliographystyle{conference}
\bibliography{references}

\appendix
\section{Reproducibility and Implementation Details}
Analyses use repeated synthetic runs with fixed seeds $\{11, 23, 37, 51\}$. Sensitivity sweeps include baseline hazard choices (Cox proportional hazards, Weibull, piecewise exponential), C-peptide thresholds (50, 100, 200 pmol/L), endpoint windows (90--365, 90--548, 180--365 days), and weight trimming levels for standardization (none, 0.01, 0.05). Endpoint harmonization sweeps include CGM range targets (70--180 and 54--180 mg/dL), HbA1c thresholds (6.5\% and 7.0\%), and smoothing options for CDF mapping. Uncertainty is summarized with mean $\pm$ SD across seeds, and posterior predictive checks are used to ensure simulated distributions match endpoint targets. The compute budget is consistent with mixed clinical and analytic workflows, and model fitting uses standard scientific computing infrastructure without specialized hardware.

\section{Additional Results}
The appendix provides supplementary diagnostics for subgroup utility and potency analyses, along with calibration summaries that complement the main-text discrimination results. Figure~\ref{fig:appendix_safety_potency} summarizes safety and potency diagnostics that are referenced in the main text.

\begin{figure}[t]
\centering
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h4_frontier.pdf}
\end{minipage}
\hfill
\begin{minipage}{0.48\linewidth}
\centering
\includegraphics[width=\linewidth]{figures/iter_1/h5_potency_calibration.pdf}
\end{minipage}
\caption{Safety and potency diagnostics from synthetic validation. Panel A plots the risk--benefit frontier across penalty parameters $\lambda$, with error bars reflecting cross-seed uncertainty for each subgroup. Panel B shows lot-level potency versus durability with a fitted trend, highlighting the weak association between release metrics and long-term outcomes.}
\label{fig:appendix_safety_potency}
\end{figure}

\subsection{Calibration Summary for the Composite Index}
Table~\ref{tab:h3_calibration} reports calibration bin summaries for the composite index. The predicted and observed success rates are aligned for most bins, indicating that the improved discrimination in Figure~\ref{fig:h3_discrimination} does not materially degrade calibration.

\begin{table}[t]
\caption{Calibration bin summaries for the composite index (mean $\pm$ SD). Each row reports predicted and observed success rates within a probability bin. Deviations between predicted and observed values quantify calibration error across risk strata.}
\label{tab:h3_calibration}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Bin & Mean predicted & Mean observed \\
\hline
0 & 0.249 $\pm$ 0.040 & 0.297 $\pm$ 0.031 \\
1 & 0.384 $\pm$ 0.038 & 0.422 $\pm$ 0.156 \\
2 & 0.487 $\pm$ 0.027 & 0.438 $\pm$ 0.177 \\
3 & 0.567 $\pm$ 0.032 & 0.656 $\pm$ 0.149 \\
4 & 0.703 $\pm$ 0.041 & 0.781 $\pm$ 0.036 \\
\hline
\end{tabular}
\end{table}

\subsection{Lot-Level Potency and Durability}
Table~\ref{tab:h5_lot} summarizes lot-level potency means and durability rates used to generate Appendix Figure~\ref{fig:appendix_safety_potency} panel B. The dispersion across lots emphasizes that potency metrics alone explain only a modest fraction of durability variance in the synthetic study.

\begin{table}[t]
\caption{Lot-level potency and durability summaries (mean $\pm$ SD). The table reports average potency scores and durable rates across seeds for each lot. Variation across lots provides context for the weak potency--durability association in the main text.}
\label{tab:h5_lot}
\centering
\small
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Lot & Potency mean & Durable rate \\
\hline
L1 & 1.034 $\pm$ 0.026 & 0.812 $\pm$ 0.072 \\
L2 & 1.019 $\pm$ 0.079 & 0.688 $\pm$ 0.298 \\
L3 & 0.986 $\pm$ 0.020 & 0.688 $\pm$ 0.161 \\
L4 & 1.082 $\pm$ 0.055 & 0.562 $\pm$ 0.161 \\
L5 & 1.089 $\pm$ 0.109 & 0.781 $\pm$ 0.120 \\
L6 & 1.034 $\pm$ 0.040 & 0.656 $\pm$ 0.157 \\
L7 & 1.018 $\pm$ 0.084 & 0.656 $\pm$ 0.188 \\
L8 & 1.040 $\pm$ 0.069 & 0.719 $\pm$ 0.213 \\
L9 & 1.045 $\pm$ 0.061 & 0.812 $\pm$ 0.125 \\
L10 & 1.091 $\pm$ 0.055 & 0.812 $\pm$ 0.161 \\
\hline
\end{tabular}
\end{table}

\end{document}