\documentclass[11pt,a4paper]{article} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{lmodern} \usepackage{microtype} \usepackage[margin=1in]{geometry} \usepackage{amsmath, amssymb, amsthm, amsfonts} \usepackage{mathtools} \usepackage{booktabs} \usepackage{longtable} \usepackage{array} \usepackage{enumitem} \usepackage{hyperref} \usepackage{url} \usepackage{xcolor} \hypersetup{ colorlinks=true, linkcolor=black, citecolor=blue, urlcolor=blue, pdftitle={MentisDB: A Hash-Chained Semantic Memory Substrate for Agentic Systems}, pdfauthor={Angel Leon} } \newtheorem{definition}{Definition} \newtheorem{proposition}{Proposition} \newtheorem{theorem}{Theorem} \newtheorem{lemma}{Lemma} \newcommand{\UUID}{\mathsf{UUID}} \newcommand{\T}{\mathcal{T}} \newcommand{\Sstar}{\Sigma^{\star}} \newcommand{\Hash}{\mathcal{H}} \newcommand{\TType}{\mathsf{ThoughtType}} \newcommand{\TRole}{\mathsf{ThoughtRole}} \newcommand{\TRel}{\mathsf{ThoughtRelationKind}} \newcommand{\Scope}{\mathsf{Scope}} \newcommand{\id}{\mathrm{id}} \newcommand{\idf}{\mathrm{idf}} \newcommand{\tf}{\mathrm{tf}} \newcommand{\df}{\mathrm{df}} \newcommand{\hprev}{h_{\mathrm{prev}}} \newcommand{\Inv}{\mathcal{I}} \newcommand{\Reg}{\mathcal{A}} \newcommand{\Expand}{\mathrm{Expand}} \title{\textbf{MentisDB}\\[0.3em] \large A Hash-Chained Semantic Memory Substrate for Agentic Systems} \author{Angel Leon\\ \small Universidad Cat\'olica Andr\'es Bello, Venezuela\\[0.3em] \small Version 0.8.9 \quad \textemdash \quad April 17, 2026} \date{} \begin{document} \maketitle \begin{abstract} Contemporary agent frameworks treat long-term memory as an afterthought, relying on ad hoc prompt stuffing, unstructured Markdown files, or proprietary session state that is opaque, non-transferable, and easily lost. We introduce \textbf{MentisDB}, a durable, semantically typed memory engine that formalizes agent memory as an append-only, hash-chained ledger of structured \emph{thoughts}. 
Formally, a chain is a sequence $\chi = (t_0, t_1, \ldots, t_{n-1})$ of typed records satisfying a cryptographic integrity invariant $t_k.h = H(\sigma(t_k \setminus \{h\}))$ and $t_k.\hprev = t_{k-1}.h$, where $H$ is SHA-256 and $\sigma$ is canonical bincode serialization. On top of $\chi$ we define a retrieval function $R : (\chi, Q) \to \mathcal{P}(\chi)$ that composes BM25 lexical scoring with per-field document-frequency gating, smooth exponential vector--lexical fusion, bidirectional graph expansion over typed relation edges, temporal edge validity predicates, session cohesion, and rank-based fusion via Reciprocal Rank Fusion (RRF). Deduplication is implemented as a Jaccard-similarity test over normalized token sets, emitting $\mathsf{Supersedes}$ edges consulted in constant time via a precomputed invalidation set. On canonical long-term memory benchmarks, MentisDB attains $R@10 = 88.7\%$ on LoCoMo-2P, $R@10 = 71.9\%$ on LoCoMo-10P, and $R@5 = 66.8\%$ / $R@10 = 72.2\%$ / $R@20 = 78.0\%$ on LongMemEval (v0.8.9, fresh chain, default retrieval settings). Results are deterministic and reproducible across independent runs. The implementation ships as a single Rust crate with an optional daemon exposing MCP, REST, and HTTPS surfaces, requires no external database, and operates without cloud or LLM dependencies in its core ingestion and retrieval path. \medskip \noindent\textbf{Keywords:} agent memory, hash-chained ledger, BM25, reciprocal rank fusion, graph expansion, temporal knowledge graphs, retrieval-augmented generation. \end{abstract} \section{Introduction} The proliferation of large-language-model (LLM) agents has exposed a fundamental gap in the systems that support them: the absence of a durable, queryable, and tamper-evident memory substrate. 
Ephemeral context windows, hand-rolled Markdown files, and provider-specific key--value stores fail to provide the properties that multi-agent coordination demands~---~namely (i) \emph{integrity} under adversarial or accidental mutation, (ii) \emph{semantic typing} to distinguish decisions from observations from corrections, (iii) \emph{temporal validity} to support point-in-time queries, (iv) \emph{hybrid retrieval} combining lexical, semantic, and graph signals, and (v) \emph{portability} across harnesses (Claude Code, Codex, Copilot, Cursor, Qwen, and beyond). \subsection{Contributions} This paper makes the following contributions: \begin{enumerate}[leftmargin=2em] \item \textbf{Formal model.} We define agent memory as an append-only, hash-chained ledger of \emph{thoughts}~---~structured, typed, attributable records~---~with a precise integrity invariant and explicit schema evolution semantics (Sections~\ref{sec:model}, \ref{sec:schema}). \item \textbf{Semantic typing.} We introduce a 30-variant $\TType$ algebra and an 8-variant $\TRole$ algebra, separating content semantics from workflow mechanics (Section~\ref{sec:semantics}). \item \textbf{Temporal edges.} We extend typed graph relations with a validity interval $[\mathtt{valid\_at}, \mathtt{invalid\_at}]$ enabling point-in-time queries via a predicate $\pi_\tau$ (Section~\ref{subsec:temporal}). \item \textbf{Hybrid retrieval.} We describe a composable retrieval pipeline~---~per-field BM25 with DF gating, smooth exponential vector--lexical fusion, bounded graph BFS with typed edge weights, session cohesion, importance weighting, and RRF~---~and characterize each signal mathematically (Section~\ref{sec:retrieval}). \item \textbf{Deduplication.} We give a Jaccard-threshold algorithm that auto-emits $\mathsf{Supersedes}$ edges, with constant-time consultation via a precomputed invalidation set $\Inv(\chi)$ (Section~\ref{sec:dedup}). 
\item \textbf{Empirical evaluation.} We report results on LoCoMo and LongMemEval, and provide a near-miss analysis characterizing the residual lexical ceiling (Section~\ref{sec:eval}). \end{enumerate} \subsection{Paper Organization} Section~\ref{sec:model} presents the core data model and integrity invariant. Section~\ref{sec:schema} formalizes schema evolution. Section~\ref{sec:semantics} defines the semantic memory algebra. Section~\ref{sec:storage} describes the storage layer. Section~\ref{sec:retrieval} develops the retrieval pipeline. Section~\ref{sec:dedup} gives the deduplication algorithm. Section~\ref{sec:ops} describes the operational surfaces (CLI, MCP). Section~\ref{sec:eval} reports empirical results. Section~\ref{sec:related} compares against related systems. Section~\ref{sec:conclusion} concludes. \section{System Model and Core Data}\label{sec:model} \subsection{Thought Record} \begin{definition}[Thought]\label{def:thought} Let $\UUID$ denote the set of RFC~4122 universally unique identifiers, $\T$ the set of UTC timestamps, $\Sstar$ the set of finite UTF-8 strings, and $\Hash = \{0,1\}^{256}$ the codomain of SHA-256. A \emph{thought} is a tuple \[ t = \bigl(v, \id, i, \tau, a, \kappa, \sigma, \varphi, \rho, c, \mathbf{T}, \mathbf{C}, f_{\mathrm{conf}}, f_{\mathrm{imp}}, s, \mathbf{R}, \mathbf{E}, \hprev, h\bigr) \] with components listed in Table~\ref{tab:thought-fields}. 
\end{definition} \begin{table}[h] \centering \small \begin{tabular}{@{}lll@{}} \toprule Symbol & Domain & Role\\ \midrule $v$ & $\mathbb{N}$ & schema version (Section~\ref{sec:schema}) \\ $\id$ & $\UUID$ & stable identity \\ $i$ & $\mathbb{N}$ & append-order index \\ $\tau$ & $\T$ & commit timestamp \\ $a$ & $\Sstar$ & agent identifier \\ $\kappa$ & $\Sstar \cup \{\bot\}$ & signing key identifier \\ $\sigma$ & $\{0,1\}^{\star} \cup \{\bot\}$ & Ed25519 signature \\ $\varphi$ & $\TType$ & semantic class \\ $\rho$ & $\TRole$ & workflow role \\ $c$ & $\Sstar$ & content \\ $\mathbf{T}, \mathbf{C}$ & $\mathcal{P}(\Sstar)$ & tags, concepts \\ $f_{\mathrm{conf}}, f_{\mathrm{imp}}$ & $[0,1]$ & confidence, importance \\ $s$ & $\Scope \cup \{\bot\}$ & visibility scope \\ $\mathbf{R}$ & $\mathcal{P}(\mathbb{N})$ & positional back-references \\ $\mathbf{E}$ & $\mathcal{P}(\mathsf{Relation})$ & typed edges \\ $\hprev, h$ & $\Hash$ & chain-integrity hashes \\ \bottomrule \end{tabular} \caption{Fields of a thought record.} \label{tab:thought-fields} \end{table} A \emph{thought input} $t^{\circ}$ is the caller-authored subset $(v, a, \varphi, \rho, c, \mathbf{T}, \mathbf{C}, f_{\mathrm{conf}}, f_{\mathrm{imp}}, s, \mathbf{R}, \mathbf{E}^{\circ})$. The chain-managed fields $\{\id, i, \tau, \hprev, h\}$ are assigned on commit; this asymmetry prevents agents from forging chain mechanics. \subsection{Hash-Chained Ledger} \begin{definition}[Chain]\label{def:chain} Let $\sigma : \mathsf{Thought} \to \{0,1\}^{\star}$ denote canonical bincode serialization of $t \setminus \{h\}$. A \emph{chain} is a sequence $\chi = (t_0, t_1, \ldots, t_{n-1})$ satisfying the \emph{integrity invariant} \[ \forall k : \quad t_k.h = H\bigl(\sigma(t_k)\bigr), \qquad \forall k \geq 1 : \quad t_k.\hprev = t_{k-1}.h, \] where $H$ is SHA-256. For $t_0$, $t_0.\hprev$ is the empty string. 
\end{definition} \begin{proposition}[Tamper Evidence]\label{prop:tamper} Let $\chi' = (t'_0, \ldots, t'_{n-1})$ differ from $\chi$ at index $j$, i.e.\ $t'_j \neq t_j$, with all other components unchanged. Then either $t'_j.h \neq H(\sigma(t'_j))$ (local inconsistency detectable at index $j$) or $t'_{j+1}.\hprev \neq t'_j.h$ (cascading inconsistency detectable at index $j+1$). Consequently, forging a modification requires recomputing every subsequent hash, touching $n - j$ records. \end{proposition} \begin{proof}[Proof sketch] By collision resistance of $H$, $t'_j \neq t_j \Rightarrow H(\sigma(t'_j)) \neq H(\sigma(t_j))$ with overwhelming probability. Either the adversary preserves $t'_j.h = H(\sigma(t'_j))$ (then $t'_j.h \neq t_j.h$, breaking $t'_{j+1}.\hprev$), or leaves $t'_j.h = t_j.h$ (then $t'_j.h \neq H(\sigma(t'_j))$, so the local check fails). \end{proof} This is a practical integrity mechanism for agent memory; it does not imply a consensus protocol or distributed-ledger guarantees. Optional Ed25519 signatures $(\kappa, \sigma)$ are layered on individual thoughts for stronger provenance when the producing agent has registered a public key in the agent registry. \subsection{Typed Relation Edges} \begin{definition}[Relation]\label{def:relation} A relation is a tuple $e = (\kappa, \id^{\ast}, \chi^{\ast}, \mathtt{v}_{\ast}, \mathtt{v}^{\ast})$ where $\kappa \in \TRel$, $\id^{\ast} \in \UUID$ is the target, $\chi^{\ast} \in \Sstar \cup \{\bot\}$ is an optional cross-chain key, and $\mathtt{v}_{\ast}, \mathtt{v}^{\ast} \in \T \cup \{\bot\}$ bound the edge's validity interval. \end{definition} The adjacency structure $A(\chi)$ induced by $\chi$ is a directed multigraph whose nodes are thought locators and whose edges derive from $\mathbf{R}$ (positional refs) and $\mathbf{E}$ (typed relations). We denote outgoing and incoming neighborhoods by $N^{+}(v)$ and $N^{-}(v)$ respectively; bidirectional expansion considers $N^{+}(v) \cup N^{-}(v)$. 
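To make Definition~\ref{def:chain} and Proposition~\ref{prop:tamper} concrete, the following self-contained Rust sketch builds and verifies a toy chain. The record type is reduced to the invariant-relevant fields, and the standard library's \texttt{DefaultHasher} stands in for SHA-256 over canonical bincode serialization; both simplifications are ours, not the shipped implementation.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Toy record carrying only the invariant-relevant fields.
struct Thought {
    content: String,
    h_prev: u64, // hash of the predecessor (0 for the genesis record)
    h: u64,      // hash over every field except `h` itself
}

// Stand-in for H(sigma(t \ {h})); the shipped system uses SHA-256
// over canonical bincode serialization instead of DefaultHasher.
fn digest(content: &str, h_prev: u64) -> u64 {
    let mut s = DefaultHasher::new();
    content.hash(&mut s);
    h_prev.hash(&mut s);
    s.finish()
}

// Chain-managed fields are assigned here, never by the caller.
fn append(chain: &mut Vec<Thought>, content: &str) {
    let h_prev = chain.last().map_or(0, |t| t.h);
    let h = digest(content, h_prev);
    chain.push(Thought { content: content.to_string(), h_prev, h });
}

// Both clauses of the integrity invariant, checked per record.
fn verify(chain: &[Thought]) -> bool {
    chain.iter().enumerate().all(|(k, t)| {
        t.h == digest(&t.content, t.h_prev) && (k == 0 || t.h_prev == chain[k - 1].h)
    })
}

fn main() {
    let mut chain = Vec::new();
    append(&mut chain, "decision: adopt bincode");
    append(&mut chain, "finding: RRF beats the additive blend");
    assert!(verify(&chain));

    // Mutating a committed record is caught (cf. the tamper-evidence proof).
    chain[0].content = "decision: adopt JSON".to_string();
    assert!(!verify(&chain));
}
```

Both clauses are checked per record, so a mutation at index $j$ is caught either locally at $j$ or via the broken back-link at $j+1$, exactly as in the proof sketch.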
\subsection{Agent Registry} To avoid duplicating identity metadata inside every record, MentisDB maintains a per-chain registry $\Reg(\chi)$ mapping $a \mapsto (\mathtt{display\_name}, \mathtt{owner}, \mathtt{description}, \mathtt{aliases}, \mathtt{status}, \mathtt{public\_keys}, \mathtt{counters})$. Thoughts carry only the stable $a$; the registry is resolved at read time. The registry is itself administrable through library calls, MCP tools, and REST endpoints, permitting pre-registration, documentation, revocation, or key rotation prior to any appended thought.

\section{Schema Evolution}\label{sec:schema}

\subsection{Version Lattice} MentisDB exposes a linearly ordered schema version space $\mathcal{V} = \{V_0, V_1, V_2, V_3\}$:

\begin{table}[h] \centering\small \begin{tabular}{@{}ll@{}} \toprule Version & Additions relative to predecessor \\ \midrule $V_0$ & original format; no explicit $v$ field \\ $V_1$ & explicit $v$, optional $(\kappa, \sigma)$, agent registry sidecar \\ $V_2$ & $\varphi := \varphi \cup \{\mathsf{Reframe}\}$, $\TRel := \TRel \cup \{\mathsf{Supersedes}\}$, optional cross-chain $\chi^{\ast}$ \\ $V_3$ & edge validity fields $\mathtt{valid\_at}, \mathtt{invalid\_at}$ \\ \bottomrule \end{tabular} \caption{Schema versions. The current constant is $V_{\mathrm{cur}} = V_3$.} \end{table}

\subsection{Migration as Idempotent Transformation} Each migration $\mu_{V_k \to V_{k+1}} : \chi_{V_k} \to \chi_{V_{k+1}}$ satisfies \[ \mu_{V_k \to V_{k+1}} \circ \mu_{V_k \to V_{k+1}} = \mu_{V_k \to V_{k+1}} \quad \text{(idempotence)} \] where $\mu_{V_k \to V_{k+1}}$ is understood to act as the identity on chains already at $V_{k+1}$, so that the self-composition is well-typed and re-running a migration is harmless. Migrations compose: $\chi_{V_0}$ is upgraded via $\mu_{V_2 \to V_3} \circ \mu_{V_1 \to V_2} \circ \mu_{V_0 \to V_1}$. Because bincode encodes enum variants by integer tag, new variants are appended to the end of an enum to preserve binary compatibility; mid-enum insertion would violate the injectivity of the serialization map.
After each migration the hash chain is rebuilt under $V_{\mathrm{cur}}$ and persisted in native format, so subsequent opens incur no migration cost. \subsection{Version Detection} Version is inferred by peeking at the first record's $v$ field. A subtle edge case arises for $V_0$ chains that lack the field entirely: the bincode ``empty-Vec fast path'' for $V_0$ reads $v = 0$, which is disambiguated from a non-empty $V_0$ chain by the residual byte-length check. Practically, this provides reliable version detection in $O(1)$ regardless of chain length. \section{Semantic Memory Algebra}\label{sec:semantics} \subsection{ThoughtType} $\TType$ partitions the semantic space into 30 disjoint classes across seven categories: \begin{table}[h] \centering\small \begin{tabular}{@{}ll@{}} \toprule Category & Variants \\ \midrule User / relationship & $\mathsf{PreferenceUpdate}, \mathsf{UserTrait}, \mathsf{RelationshipUpdate}$ \\ Observation & $\mathsf{Finding}, \mathsf{Insight}, \mathsf{FactLearned}, \mathsf{PatternDetected}, \mathsf{Hypothesis}, \mathsf{Surprise}$ \\ Error / correction & $\mathsf{Mistake}, \mathsf{Correction}, \mathsf{LessonLearned}, \mathsf{AssumptionInvalidated}, \mathsf{Reframe}$ \\ Planning & $\mathsf{Constraint}, \mathsf{Plan}, \mathsf{Subgoal}, \mathsf{Goal}, \mathsf{Decision}, \mathsf{StrategyShift}$ \\ Exploration & $\mathsf{Wonder}, \mathsf{Question}, \mathsf{Idea}, \mathsf{Experiment}$ \\ Execution & $\mathsf{ActionTaken}, \mathsf{TaskComplete}$ \\ State & $\mathsf{Checkpoint}, \mathsf{StateSnapshot}, \mathsf{Handoff}, \mathsf{Summary}$ \\ \bottomrule \end{tabular} \end{table} \subsection{ThoughtRole} $\TRole$ is orthogonal to $\TType$, specifying \emph{how} the system uses a memory: \[ \TRole = \{\mathsf{Memory}, \mathsf{WorkingMemory}, \mathsf{Summary}, \mathsf{Compression}, \mathsf{Checkpoint}, \mathsf{Handoff}, \mathsf{Audit}, \mathsf{Retrospective}\}. \] The product $\TType \times \TRole$ yields 240 distinguishable semantic positions. 
A retrospective lesson is encoded as $(\mathsf{LessonLearned}, \mathsf{Retrospective})$.

\subsection{ThoughtRelationKind} \[ \TRel = \{\mathsf{References}, \mathsf{Summarizes}, \mathsf{Corrects}, \mathsf{Invalidates}, \mathsf{CausedBy}, \] \[ \mathsf{Supports}, \mathsf{Contradicts}, \mathsf{DerivedFrom}, \mathsf{ContinuesFrom}, \mathsf{BranchesFrom}, \mathsf{RelatedTo}, \mathsf{Supersedes}\}. \] $\mathsf{Supersedes}$ is distinguished from $\mathsf{Corrects}$: the former replaces a prior framing without asserting an error, while the latter asserts a factual correction.

\subsection{Temporal Edge Validity}\label{subsec:temporal} \begin{definition}[As-Of Predicate]\label{def:asof} For a relation $e$ with half-open validity interval $[\mathtt{v}_{\ast}, \mathtt{v}^{\ast})$ (treating $\bot$ on either bound as $-\infty$ and $+\infty$ respectively), \[ \pi_\tau(e) \equiv (\mathtt{v}_{\ast} = \bot \lor \mathtt{v}_{\ast} \leq \tau) \land (\mathtt{v}^{\ast} = \bot \lor \tau < \mathtt{v}^{\ast}). \] \end{definition} Graph expansion restricted by $\tau$ considers only edges satisfying $\pi_\tau$. Combined with restricting candidates to thoughts committed no later than $\tau$, which the append-only ordering makes cheap, this yields point-in-time retrieval: ``what did the agent know at $\tau$?''

\subsection{Invalidation Set} \begin{definition}[Invalidation Set]\label{def:inv} Given $\chi$, \[ \Inv(\chi) = \bigl\{ e.\id^{\ast} : e \in \textstyle\bigcup_{t \in \chi} \mathbf{E}(t),\; \kappa(e) \in \{\mathsf{Supersedes}, \mathsf{Corrects}, \mathsf{Invalidates}\} \bigr\}. \] \end{definition} $\Inv(\chi)$ is precomputed at chain open time as a $\mathsf{HashSet}\langle\UUID\rangle$, enabling $O(1)$ superseded-thought detection during retrieval.

\section{Storage Layer}\label{sec:storage} \subsection{Storage Adapter Abstraction} The trait $\mathsf{StorageAdapter}$ abstracts persistence with four essential operations: \texttt{load\_thoughts}, \texttt{append\_thought}, \texttt{flush}, \texttt{set\_auto\_flush}.
This allows the chain semantics to remain invariant under backend substitution. \subsection{BinaryStorageAdapter} The default backend serializes each thought as a length-prefixed bincode record: \[ \underbrace{\ell_0}_{\text{4-byte LE}}\underbrace{\sigma(t_0)}_{\ell_0\text{ bytes}} \parallel \underbrace{\ell_1}_{\text{4-byte LE}}\underbrace{\sigma(t_1)}_{\ell_1\text{ bytes}} \parallel \cdots \] with file extension \texttt{.tcbin}. Two durability modes are supported: \begin{itemize}[leftmargin=2em] \item \textbf{Strict} ($\mathtt{auto\_flush} = \mathrm{true}$): appends are routed through a dedicated writer thread; callers block until the flush acknowledgment returns. A group-commit window of $\Delta_{\mathrm{gc}}$ (default 2\,ms) amortizes flush cost across concurrent writers. \item \textbf{Buffered} ($\mathtt{auto\_flush} = \mathrm{false}$): records are queued and batched; the writer flushes every $\Phi = 16$ records. Up to $\Phi - 1 = 15$ records may be lost on a hard crash, with a corresponding throughput gain. \end{itemize} \subsection{Legacy Adapters} \texttt{LegacyJsonlReadAdapter} is a read-only compatibility shim for migrating $V_0$ \texttt{.jsonl} chains; it cannot be used for new writes. \subsection{File Layout} \begin{verbatim} ~/.mentisdb/ mentisdb-registry.json mentisdb-skills.bin .tcbin .agents.json .vectors.bin tls/{cert.pem, key.pem} \end{verbatim} \section{Retrieval}\label{sec:retrieval} Retrieval separates a deterministic filter-first baseline from a scored ranked pipeline. \subsection{Baseline Filter} The baseline $R_{\mathrm{base}}(\chi, Q)$ narrows candidates by indexed fields $(\varphi, \rho, a, \mathbf{T}, \mathbf{C})$ and applies a case-insensitive substring predicate over $(c, \mathtt{agent\_meta}, \mathbf{T}, \mathbf{C})$. Results return in append order. This path is explainable and has no BM25, vector, or graph component. 
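As a reference point for the filter-first path, a minimal sketch of $R_{\mathrm{base}}$ follows. The field set is reduced to a type tag, tags, and content, and the enum variants are illustrative; the shipped predicate also covers role, agent, concepts, and agent metadata.

```rust
// Reduced thought record for the filter-first baseline sketch.
// Variants and fields are illustrative, not the shipped schema.
#[derive(PartialEq)]
enum ThoughtType {
    Decision,
    Finding,
}

struct Thought {
    ttype: ThoughtType,
    content: String,
    tags: Vec<String>,
}

// Indexed-field narrowing plus a case-insensitive substring predicate;
// `filter` preserves append order, matching the deterministic baseline.
fn baseline<'a>(
    chain: &'a [Thought],
    ttype: Option<&ThoughtType>,
    needle: Option<&str>,
) -> Vec<&'a Thought> {
    chain
        .iter()
        .filter(|t| ttype.map_or(true, |q| &t.ttype == q))
        .filter(|t| {
            needle.map_or(true, |q| {
                let q = q.to_lowercase();
                t.content.to_lowercase().contains(&q)
                    || t.tags.iter().any(|g| g.to_lowercase().contains(&q))
            })
        })
        .collect()
}

fn main() {
    let chain = vec![
        Thought {
            ttype: ThoughtType::Decision,
            content: "Adopt bincode for the storage layer".to_string(),
            tags: vec!["storage".to_string()],
        },
        Thought {
            ttype: ThoughtType::Finding,
            content: "RRF outperforms the additive blend".to_string(),
            tags: vec![],
        },
    ];
    let hits = baseline(&chain, Some(&ThoughtType::Decision), Some("bincode"));
    assert_eq!(hits.len(), 1);
}
```

Because every clause is a plain predicate over stored fields, each returned hit is trivially explainable: it matched the requested type and contained the needle.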
\subsection{Ranked Pipeline} Ranked search $R_{\mathrm{rank}}$ selects a backend based on query features:

\begin{table}[h] \centering\small \begin{tabular}{@{}ll@{}} \toprule Query features & Backend \\ \midrule non-empty text, no vector sidecar & $\mathsf{Lexical}$ \\ non-empty text, vector sidecar & $\mathsf{Hybrid}$ \\ non-empty text, graph enabled, no vector & $\mathsf{LexicalGraph}$ \\ non-empty text, graph enabled, vector sidecar & $\mathsf{HybridGraph}$ \\ empty or absent text & $\mathsf{Heuristic}$ \\ \bottomrule \end{tabular} \end{table}

\subsection{BM25 with Per-Field DF Gating} \begin{definition}[BM25 Field Score]\label{def:bm25} Let $N = |\chi|$ be corpus size, $\df(q)$ the document frequency of term $q$, and for field $f \in \{\mathrm{content}, \mathrm{tags}, \mathrm{concepts}, \mathrm{agent\_id}, \mathrm{agent\_registry}\}$ let $\tf_f(d,q)$ and $|d|_f$ denote term frequency and field length respectively, with $\overline{|d|_f}$ the corpus mean. With $k_1 = 1.2$, $b = 0.75$~\cite{robertson2009bm25}: \begin{align} \idf(q) &= \ln\!\left(\frac{N - \df(q) + 0.5}{\df(q) + 0.5} + 1\right), \\ \mathrm{score}_f(d, q) &= \idf(q) \cdot \frac{\tf_f(d, q)\,(k_1 + 1)}{\tf_f(d, q) + k_1\!\left(1 - b + b\,\dfrac{|d|_f}{\overline{|d|_f}}\right)}. \end{align} \end{definition} \begin{definition}[DF Gate]\label{def:dfgate} For per-field cutoff $\tau_f \in [0, 1]$, \[ \gamma_f(q) = \mathbb{1}\!\left[\frac{\df(q)}{N} \leq \tau_f \;\lor\; N < 20\right]. \] Defaults: $\tau_{\mathrm{content}} = \tau_{\mathrm{tags}} = \tau_{\mathrm{concepts}} = 0.30$, $\tau_{\mathrm{agent\_registry}} = 0.60$, $\tau_{\mathrm{agent\_id}} = 0.70$. \end{definition} \begin{definition}[Lexical Score]\label{def:lex} With per-field weights $w_f$ (defaults $w_{\mathrm{content}} = 1.0$, $w_{\mathrm{tags}} = 1.6$, $w_{\mathrm{concepts}} = 1.4$, $w_{\mathrm{agent\_id}} = 1.5$, $w_{\mathrm{agent\_registry}} = 1.1$): \[ S_\ell(d, Q) = \sum_{q \in Q} \sum_{f} \gamma_f(q) \cdot w_f \cdot \mathrm{score}_f(d, q).
\] \end{definition} A term violating $\gamma_f$ in one field may still contribute through other fields whose cutoffs it respects. The 20-document threshold suppresses DF filtering on small corpora where statistics are not yet meaningful. Normalization applies Porter stemming~\cite{porter1980} before indexing and querying. An irregular-verb lemma table of approximately 170 entries expands query-time tokens (e.g.\ \texttt{went}~$\to$~\texttt{go}, \texttt{saw}~$\to$~\texttt{see}), because Porter stemming cannot normalize suppletive forms.

\subsection{Smooth Vector--Lexical Fusion} When a managed vector sidecar provides cosine similarity $s_v(d, Q) \in [-1, 1]$ (e.g.\ via ONNX-embedded \texttt{fastembed-minilm}), the hybrid contribution is \begin{equation}\label{eq:fuse} S_{\mathrm{fuse}}(d, Q) = s_v(d, Q) \cdot \Bigl(1 + \alpha \exp\!\bigl(-S_\ell(d, Q) / \beta\bigr)\Bigr), \end{equation} with $\alpha = 35$ and $\beta = 3$. This yields $\sim\!36\times$ amplification for pure-semantic matches ($S_\ell = 0$), decays to $\sim\!14\times$ at $S_\ell = 3$ (the multiplier is $1 + 35e^{-1} \approx 13.9$), and falls toward plain additive composition as $S_\ell$ grows. The smooth exponential eliminates the discontinuities that step-function boost tiers introduce at bin boundaries.

\subsection{Graph-Aware Expansion} \begin{definition}[Bounded BFS Expansion]\label{def:expand} Given seed set $\Sigma_0 \subseteq \chi$ with $|\Sigma_0| \leq 20$, adjacency index $A(\chi)$, and traversal mode $M \in \{\mathsf{Out}, \mathsf{In}, \mathsf{Bi}\}$, \[ \Expand_{d_{\max}, V_{\max}, M}(\Sigma_0) = \bigl\{ (v, d, \pi) : v \in \chi,\; d \leq d_{\max},\; \mathrm{path}(\pi) \subseteq \chi \bigr\}, \] bounded by maximum depth $d_{\max}$ and visit budget $V_{\max}$, with edges optionally filtered by $\pi_\tau$ (Definition~\ref{def:asof}).
\end{definition} \paragraph{Edge weights.} For traversals along a typed relation of kind $\kappa$, the edge contributes $b_{\mathrm{rel}}(\kappa)$: \begin{table}[h] \centering\small \begin{tabular}{@{}lclc@{}} \toprule $\kappa$ & $b_{\mathrm{rel}}$ & $\kappa$ & $b_{\mathrm{rel}}$ \\ \midrule $\mathsf{ContinuesFrom}$ & 0.60 & $\mathsf{Summarizes}$ & 0.20 \\ $\mathsf{BranchesFrom}$ & 0.55 & $\mathsf{CausedBy}$ & 0.20 \\ $\mathsf{Corrects}, \mathsf{Invalidates}$ & 0.50 & $\mathsf{Supports}, \mathsf{Contradicts}$ & 0.15 \\ $\mathsf{Supersedes}$ & 0.45 & $\mathsf{RelatedTo}$ & 0.08 \\ $\mathsf{DerivedFrom}$ & 0.40 & $\mathsf{References}$ & 0.06 \\ \bottomrule \end{tabular} \end{table} \paragraph{Graph proximity.} For a hit at depth $d \geq 1$, $S_{\mathrm{graph}}(d) = 1/d$. \subsection{Session Cohesion} \begin{definition}[Session Cohesion Boost]\label{def:coh} For a seed $\sigma \in \Sigma_0$ with $S_\ell(\sigma) \in [\theta_{\mathrm{seed}}, \theta_{\mathrm{solo}}) = [3, 5)$, and a candidate $d$ with $|i(d) - i(\sigma)| \leq 8$, \[ S_{\mathrm{coh}}(d) = \max_{\sigma \in \Sigma_0} \max\!\Bigl(0,\; 0.8 \cdot \bigl(1 - |i(d) - i(\sigma)|/8\bigr) \cdot \mathbb{1}[\theta_{\mathrm{seed}} \leq S_\ell(\sigma) < \theta_{\mathrm{solo}}]\Bigr). \] \end{definition} Seeds above $\theta_{\mathrm{solo}}$ are excluded because they are strong enough to stand on their own; the cohesion boost is meant to surface \emph{evidence turns} adjacent to a match that share no direct lexical terms. \subsection{Importance Weighting} \begin{definition}[Importance Boost]\label{def:imp} \[ S_{\mathrm{imp}}(d, Q) = S_\ell(d, Q) \cdot (f_{\mathrm{imp}}(d) - 0.5) \cdot 0.3. \] \end{definition} User-originated thoughts ($f_{\mathrm{imp}} \approx 0.8$) outrank verbose assistant responses ($f_{\mathrm{imp}} \approx 0.2$) in close BM25 races. The differential structure prevents flat multipliers from overwhelming lexical signal. 
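The smooth fusion rule (Equation~\ref{eq:fuse}) is small enough to check numerically; the sketch below evaluates it at the default $\alpha = 35$, $\beta = 3$ and confirms the limiting multipliers.

```rust
// Smooth vector--lexical fusion with the paper's default constants.
fn fuse(s_vec: f64, s_lex: f64) -> f64 {
    let (alpha, beta) = (35.0, 3.0);
    s_vec * (1.0 + alpha * (-s_lex / beta).exp())
}

fn main() {
    // Pure-semantic match (S_lex = 0): multiplier is exactly 1 + alpha = 36.
    assert_eq!(fuse(0.5, 0.0), 0.5 * 36.0);
    // At S_lex = 3 the boost term alone is alpha / e, about 12.9,
    // giving a total multiplier of about 13.9.
    let m = fuse(0.5, 3.0) / 0.5;
    assert!(m > 13.8 && m < 14.0);
    // Large lexical scores: the multiplier tends to 1, so the vector
    // signal composes additively rather than being amplified.
    assert!(fuse(0.5, 30.0) / 0.5 < 1.01);
}
```

The continuity of the multiplier in $S_\ell$ is what removes the bin-boundary discontinuities mentioned above: an infinitesimal change in lexical score can never flip a document across a boost tier.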
\subsection{Reciprocal Rank Fusion} When $\mathtt{enable\_reranking}$ is set, the top $K = \mathtt{rerank\_k}$ candidates (default 50) are reranked via RRF~\cite{cormack2009rrf}. \begin{definition}[Reciprocal Rank Fusion]\label{def:rrf} Given $m$ ranked lists $L_1, \ldots, L_m$ of candidate documents, with $\mathrm{rank}_i(d)$ the 1-indexed position of $d$ in $L_i$ (or $+\infty$ if absent), and damping constant $k = 60$, \begin{equation}\label{eq:rrf} S_{\mathrm{RRF}}(d) = \sum_{i=1}^{m} \frac{1}{k + \mathrm{rank}_i(d)}. \end{equation} \end{definition} MentisDB produces three single-signal rankings~---~lexical-only ($S_\ell$), vector-only ($s_v$), and graph-only ($S_{\mathrm{graph}} + b_{\mathrm{rel}} + S_{\mathrm{seed}}$)~---~and fuses them via $S_{\mathrm{RRF}}$. The RRF total replaces the additive blend. Non-rankable signals ($S_{\mathrm{imp}}, S_{\mathrm{coh}}, f_{\mathrm{conf}}$, recency) are then added as small tie-breaking adjustments. RRF is pure arithmetic: no LLM, no external service, no network round trip. \subsection{Memory Scopes} $\Scope = \{\mathsf{User}, \mathsf{Session}, \mathsf{Agent}\}$, stored as tag markers \texttt{scope:\{variant\}}. A query with scope $s$ filters hits such that $s(d) = s$; absence of a scope filter returns all scopes. \subsection{Context Bundles} $\mathrm{Bundle}(\chi, Q) = \{(\sigma, N^{\pm}(\sigma) \cap R_{\mathrm{rank}}(\chi, Q)) : \sigma \in \Sigma_0\}$: each bundle pairs a lexical seed with its graph-expanded neighbors, presented in deterministic provenance order so agents can interpret \emph{why} supporting thoughts surfaced. \subsection{Vector Sidecars} Vector state lives in rebuildable per-chain sidecars, partitioned by $(\chi, \id, h, \mathrm{model\_id}, \dim, \mathrm{version})$. Model or version changes invalidate old sidecars rather than silently mixing incompatible embeddings. Managed sidecars remain synchronized on append; the daemon defaults to local ONNX inference via \texttt{fastembed-minilm}. 
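The RRF combination of Definition~\ref{def:rrf} is a few lines of arithmetic. The sketch below fuses three illustrative ranked lists, standing in for the lexical-only, vector-only, and graph-only rankings, with the damping constant $k = 60$.

```rust
use std::collections::HashMap;

// Reciprocal Rank Fusion over m ranked lists. A document absent from
// a list contributes nothing, matching rank = +infinity in Definition 15.
fn rrf(lists: &[Vec<&str>], k: f64) -> Vec<(String, f64)> {
    let mut score: HashMap<String, f64> = HashMap::new();
    for list in lists {
        for (i, doc) in list.iter().enumerate() {
            // enumerate() is 0-indexed; RRF ranks are 1-indexed.
            *score.entry((*doc).to_string()).or_insert(0.0) += 1.0 / (k + (i + 1) as f64);
        }
    }
    let mut fused: Vec<(String, f64)> = score.into_iter().collect();
    fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    fused
}

fn main() {
    // Stand-ins for the lexical, vector, and graph single-signal rankings.
    let lists = vec![
        vec!["a", "b"], // lexical
        vec!["b", "a"], // vector
        vec!["b", "c"], // graph
    ];
    let fused = rrf(&lists, 60.0);
    // "b" ranks highly in all three lists and wins the fusion.
    assert_eq!(fused[0].0, "b");
}
```

Because each list contributes at most $1/(k+1)$ per document, no single signal can dominate; consensus across rankings is what accumulates score, which is precisely why RRF is robust to incomparable score scales.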
\subsection{Decomposed Scores} Each ranked hit exposes a score vector \[ \mathbf{s}(d) = (S_\ell, s_v, S_{\mathrm{graph}}, b_{\mathrm{rel}}, S_{\mathrm{seed}}, S_{\mathrm{imp}}, f_{\mathrm{conf}}, S_{\mathrm{rec}}, S_{\mathrm{coh}}, S_{\mathrm{tot}}) \] together with $\mathrm{matched\_terms}$ and $\mathrm{match\_sources}$, preserving auditability.

\section{Deduplication}\label{sec:dedup} \subsection{Jaccard--Supersedes Algorithm} Let $\mathcal{N}(t) \subseteq \Sstar$ denote the normalized token set of thought $t$ (Porter-stemmed, lemma-expanded). Given threshold $\theta \in [0, 1]$ and scan window $w \in \mathbb{N}$, on append of $t_n$ with $\mathcal{N}(t_n) \neq \emptyset$, MentisDB computes \[ t^{\ast} = \arg\max_{t \in \{t_{n-w}, \ldots, t_{n-1}\}} J\bigl(\mathcal{N}(t_n), \mathcal{N}(t)\bigr), \qquad J(A, B) = \frac{|A \cap B|}{|A \cup B|}. \] If $J(\mathcal{N}(t_n), \mathcal{N}(t^{\ast})) \geq \theta$, it auto-emits a relation $e = (\mathsf{Supersedes}, t^{\ast}.\id, \bot, \tau_{\mathrm{now}}, \bot)$ on $t_n$, and updates $\Inv(\chi)$ to include $t^{\ast}.\id$ for $O(1)$ skipping in subsequent retrieval. \paragraph{Complexity.} Token normalization is $O(|c|)$ per record. Jaccard over the window is $O(w \cdot \overline{|\mathcal{N}|})$. With default $w = 64$ and typical tokens per thought $\leq 200$, dedup cost is bounded by a constant factor of append cost.

\subsection{Configuration}
\begin{verbatim}
MENTISDB_DEDUP_THRESHOLD = theta in [0, 1]
MENTISDB_DEDUP_SCAN_WINDOW = w in N (default 64)
\end{verbatim}
The superseded thought is retained for audit; no content is deleted. Ranked search deprioritizes it via $\Inv(\chi)$.

\section{Operational Surfaces}\label{sec:ops} \subsection{CLI} The CLI exposes three subcommands~---~\texttt{add}, \texttt{search}, \texttt{agents}~---~each implemented as a REST call to the daemon \texttt{mentisdbd} at \texttt{http://127.0.0.1:9472}. Synchronous HTTP (\texttt{ureq}) avoids pulling an async runtime into the client path.
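Returning to the deduplication test of Section~\ref{sec:dedup}, the Jaccard computation and window scan are equally compact. In the sketch below, plain lowercase whitespace tokenization stands in for the Porter-stemmed, lemma-expanded normalization $\mathcal{N}$, and the function reports the window index of the best match when it clears $\theta$.

```rust
use std::collections::HashSet;

// Plain lowercase tokenization stands in for the Porter-stemmed,
// lemma-expanded normalization N(t) used by the shipped system.
fn tokens(s: &str) -> HashSet<String> {
    s.to_lowercase().split_whitespace().map(String::from).collect()
}

fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let union = a.union(b).count();
    if union == 0 {
        return 0.0;
    }
    a.intersection(b).count() as f64 / union as f64
}

// Scan the trailing window; report the index of the best match when
// its similarity clears theta, mirroring the Supersedes emission.
fn dedup_check(window: &[&str], incoming: &str, theta: f64) -> Option<usize> {
    let new = tokens(incoming);
    window
        .iter()
        .enumerate()
        .map(|(i, t)| (i, jaccard(&new, &tokens(t))))
        .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
        .filter(|&(_, j)| j >= theta)
        .map(|(i, _)| i)
}

fn main() {
    let window = ["the build is green", "user prefers dark mode"];
    // J with window[1] is 4/5 = 0.8, which clears theta = 0.6.
    assert_eq!(dedup_check(&window, "user prefers the dark mode", 0.6), Some(1));
    // An unrelated thought matches nothing in the window.
    assert_eq!(dedup_check(&window, "rotate the signing key", 0.6), None);
}
```

When a match is reported, the real system appends the new thought anyway and only emits the $\mathsf{Supersedes}$ edge; nothing is deleted, consistent with the audit-retention policy above.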
\subsection{MCP Server} \texttt{mentisdbd} exposes a streamable HTTP MCP endpoint at \texttt{POST /} (port 9471) with 35 tools covering bootstrap, append, search, read, export/import, agent registry, chain management, and a skill registry. Legacy REST endpoints \texttt{POST /tools/list} and \texttt{POST /tools/execute} remain available for compatibility. \subsection{Skill Registry} The skill registry is a git-like immutable version store for agent instruction bundles. An upload to an existing $\mathtt{skill\_id}$ creates a new immutable version: the first is stored as full content, subsequent versions as unified diff patches. Version reconstruction replays patches from $v_0$ forward. Content hashes are computed over reconstructed content, decoupling integrity from storage representation. Agents with registered Ed25519 keys must cryptographically sign uploads; signature verification is server-side before acceptance. \subsection{Bootstrap Protocol} Modern MCP clients bootstrap from the handshake: \begin{enumerate}[leftmargin=2em] \item $\mathtt{initialize.instructions}$ directs the agent to read $\mathtt{mentisdb://skill/core}$. \item $\mathtt{resources/read}$ returns the embedded operating skill. \item $\mathtt{mentisdb\_bootstrap}$ opens or creates the chain; if empty, writes a genesis checkpoint. \item $\mathtt{mentisdb\_recent\_context}$ loads prior state. \end{enumerate} \section{Empirical Evaluation}\label{sec:eval} \subsection{Benchmarks} We evaluate MentisDB on two standard long-term memory benchmarks. 
\begin{table}[h] \centering\small \begin{tabular}{@{}llccc@{}} \toprule Benchmark & Metric & v0.8.1 & v0.8.5 & v0.8.9 \\ \midrule LoCoMo-2P & $R@10$ & \textbf{88.7\%} & -- & -- \\ LoCoMo-2P single-hop & $R@10$ & 90.7\% & -- & -- \\ LoCoMo-10P (1977 queries) & $R@10$ & 74.2\% & \textbf{74.6\%} & \textbf{71.9\%} \\ LoCoMo-10P single-hop & $R@10$ & -- & 79.0\% & 75.8\% \\ LoCoMo-10P multi-hop & $R@10$ & -- & 58.4\% & 57.4\% \\ LoCoMo-10P & $R@20$ & -- & -- & 79.1\% \\ LongMemEval (fresh chain) & $R@5$ & 67.6\% & -- & \textbf{66.8\%} \\ LongMemEval (fresh chain) & $R@10$ & 73.2\% & -- & \textbf{72.2\%} \\ LongMemEval (fresh chain) & $R@20$ & -- & -- & 78.0\% \\ \bottomrule \end{tabular} \caption{Retrieval quality across MentisDB releases. All v0.8.9 numbers are the mean of independent full-scale runs on 2026-04-14 and 2026-04-17 against fresh chains with default retrieval configuration (\texttt{fastembed-minilm} vector sidecar, graph expansion enabled, RRF reranking disabled); the scores were bit-identical across runs.} \end{table} The v0.8.5 LoCoMo-10P improvement derives from three changes: \begin{enumerate}[leftmargin=2em] \item Session cohesion tuning: radius $8 \to 12$, boost $0.8 \to 1.2$. \item Doubled edge weights $b_{\mathrm{rel}}$ across all $\TRel$ variants. \item FastEmbed MiniLM sentence embeddings replacing text-only hashing when the \texttt{local-embeddings} feature is compiled. 
\end{enumerate}

\subsection{Scoring Evolution} \begin{table}[h] \centering\small \begin{tabular}{@{}llcc@{}} \toprule Version & Change & LME $R@5$ & LoCoMo-10P $R@10$ \\ \midrule 0.8.0 baseline & -- & 57.2\% & -- \\ 0.8.0 + Porter stemming & token normalization & 61.6\% & -- \\ 0.8.0 + tiered fusion + importance & vector/lexical balance & 65.0\% & -- \\ 0.8.1 + cohesion + smooth fusion + DF cutoff & retrieval quality & 67.6\% & 74.2\% \\ 0.8.5 + cohesion tuning + $b_{\mathrm{rel}} \times 2$ + fastembed & session/graph boost & -- & 74.6\% \\ 0.8.9 + irregular lemmas + webhooks & lemma expansion + events & 66.8\% & 71.9\% \\ \bottomrule \end{tabular} \end{table}

\subsection{Near-Miss Analysis (LoCoMo-10P, v0.8.5)} Of 503 misses (gold answer absent from top-10): \begin{table}[h] \centering\small \begin{tabular}{@{}lccl@{}} \toprule Bucket & Count & Fraction & Interpretation \\ \midrule $R@20$ hit & 130 & 25.8\% & close ranking error \\ $R@50$ hit & 285 & 56.7\% & moderate signal gap \\ $R > 50$ & 218 & 43.3\% & lexical gap (query terms absent from evidence) \\ \bottomrule \end{tabular} \end{table} The buckets are cumulative: every $R@20$ hit is also counted in the $R@50$ row, so the second and third rows partition the 503 misses ($56.7\% + 43.3\% = 100\%$). The 43.3\% figure represents a hard ceiling for BM25-only retrieval on this benchmark. Closing it requires larger embedding models, LLM-driven query expansion, or external knowledge retrieval~---~mechanisms orthogonal to the scoring pipeline formalized here.

\subsection{Micro-Benchmarks} Criterion micro-benchmarks span five domains: append throughput (\texttt{thought\_chain}), baseline search (\texttt{search\_baseline}), ranked retrieval (\texttt{search\_ranked}), skill registry lifecycle (\texttt{skill\_registry}), and HTTP concurrency at $\{100, 10^{3}, 10^{4}\}$ concurrent Tokio tasks with $p_{50}/p_{95}/p_{99}$ reporting (\texttt{http\_concurrency}). A \texttt{DashMap}-based concurrent chain lookup delivers 750--930 read req/s at $10^{4}$ concurrent tasks, versus the previous $\mathsf{RwLock}\langle\mathsf{HashMap}\rangle$ bottleneck.
\section{Related Work and Positioning}\label{sec:related} \begin{table}[h] \centering\small \begin{tabular}{@{}lcccc@{}} \toprule Feature & \textbf{MentisDB} & Mem0 & Graphiti / Zep & Letta / MemGPT \\ \midrule Language & Rust & Python & Python & Python / TS \\ Storage & embedded file & external DB & Neo4j / FalkorDB & external DB \\ LLM required for core & \textbf{No} & Yes & Yes & Yes \\ Cryptographic integrity & \textbf{SHA-256 chain} & -- & -- & -- \\ Hybrid retrieval & BM25+vec+graph & vec+keyword & sem+kw+graph & -- \\ Temporal facts & $[\mathtt{v}_{\ast}, \mathtt{v}^{\ast}]$ & update-only & $[\mathtt{v}_{\ast}, \mathtt{v}^{\ast}]$ & -- \\ Deduplication & Jaccard+$\mathsf{Supersedes}$ & LLM-based & merge & -- \\ Agent registry & Yes & -- & -- & Yes \\ MCP server & Built-in & -- & Yes & -- \\ \bottomrule \end{tabular} \caption{Feature comparison with contemporary agent-memory systems.} \end{table} MentisDB is, to our knowledge, the only system combining (i) embedded storage, (ii) zero LLM dependency in the core path, (iii) cryptographic chain integrity, and (iv) hybrid BM25 + vector + graph retrieval in a single static binary. Gaps relative to these competitors include custom per-chain entity/relation ontologies (as in Graphiti's Pydantic models), LLM-driven memory extraction, a browser extension, and per-thought token accounting. \section{Discussion, Limitations, and Conclusion}\label{sec:conclusion} \subsection{Limitations} \begin{itemize}[leftmargin=2em] \item \textbf{Ceiling of sparse retrieval.} The near-miss analysis quantifies an irreducible $43\%$ lexical gap on LoCoMo-10P for BM25-only retrieval. Dense embeddings mitigate but do not eliminate this. \item \textbf{Local-only integrity model.} The hash chain provides tamper evidence, not Byzantine fault tolerance or distributed consensus; cross-chain consistency is not enforced cryptographically.
\item \textbf{Schema churn discipline.} Because bincode tags enum variants by ordinal, schema evolution is append-only at the enum level~---~reordering or removing variants changes the persisted ordinals and silently corrupts existing data (renaming alone is harmless, since variant names are never serialized). \end{itemize} \subsection{Future Work} \begin{itemize}[leftmargin=2em] \item Per-chain entity/relation ontologies enabling typed domain-specific facts beyond the fixed $\TRel$. \item Episode provenance: tracing derived facts back to source conversations. \item Cross-chain federated retrieval with result reconciliation across distributed ledgers. \item Optional LLM-extracted memories as a layered, auditable transform. \item Self-improving skill registry: agents committing updated skill versions as they learn, with signed provenance. \end{itemize} \subsection{Conclusion} MentisDB formalizes agent memory as an append-only, hash-chained ledger of semantically typed thoughts, and couples that ledger with a composable retrieval pipeline~---~BM25 with per-field DF gating, smooth vector--lexical fusion, bounded typed-edge graph expansion, RRF, session cohesion, and Jaccard-based deduplication~---~in a single embedded Rust substrate. Empirical results on LoCoMo and LongMemEval demonstrate competitive retrieval quality without reliance on external databases or LLM services for the core ingestion and retrieval path. The system is released as open source and exposes MCP, REST, and HTTPS surfaces for interoperation with contemporary agentic harnesses. \begin{thebibliography}{9} \bibitem{robertson2009bm25} S.~Robertson and H.~Zaragoza. \emph{The Probabilistic Relevance Framework: BM25 and Beyond}. Foundations and Trends in Information Retrieval, 3(4), 2009. \bibitem{cormack2009rrf} G.~V.~Cormack, C.~L.~A.~Clarke, and S.~B\"uttcher. \emph{Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods}. In Proc.\ SIGIR, 2009. \bibitem{porter1980} M.~F.~Porter. \emph{An Algorithm for Suffix Stripping}. Program, 14(3), 1980. \bibitem{jaccard1901} P.~Jaccard.
\emph{\'Etude comparative de la distribution florale dans une portion des Alpes et du Jura}. Bulletin de la Soci\'et\'e Vaudoise des Sciences Naturelles, 1901. \bibitem{bernstein2012ed25519} D.~J.~Bernstein, N.~Duif, T.~Lange, P.~Schwabe, and B.-Y.~Yang. \emph{High-Speed High-Security Signatures}. Journal of Cryptographic Engineering, 2012. \bibitem{fips180-4} NIST FIPS PUB 180-4. \emph{Secure Hash Standard (SHS)}, 2015. \bibitem{maharana2024locomo} A.~Maharana et al. \emph{Evaluating Very Long-Term Conversational Memory of LLM Agents (LoCoMo)}, 2024. \bibitem{wu2024longmemeval} D.~Wu et al. \emph{LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory}, 2024. \bibitem{anthropic2024mcp} Anthropic. \emph{Model Context Protocol (MCP) Specification}, 2024. \end{thebibliography} \end{document}