/**
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

// Delimiter that opens and closes an inline token: `§key: value§`.
export const TOKEN_CHARACTER = "§";

// The only prefixes that may directly follow an opening `§` for the span to
// be treated as a token. Anything else is passed through as literal text.
const ALLOWED_TOKEN_STARTS = Object.freeze([
  "search:",
  "existing_memory:",
  "followup:",
  "url_token:",
  "kit:",
]);

// Length of the longest allowed start; a candidate longer than this can
// never become a valid token.
const MAX_START_LEN = Math.max(
  ...ALLOWED_TOKEN_STARTS.map(string => string.length)
);

// Keep a tail of recently emitted plain text so that when a URL token
// arrives we can peek backwards to tell whether the token is inside a
// markdown link `[click](URL)` or sitting on its own. 16 chars is
// enough even with whitespace between `](` and the token.
const URL_CONTEXT_LOOKBACK_CHARS = 16;

// Matches when the recent emitted text ends in `](` (with optional
// whitespace) — meaning the URL portion of a markdown link comes next.
const AT_MARKDOWN_LINK_URL_RE = /\]\(\s*$/;

// `encodeURIComponent` leaves `(` and `)` alone because they are
// "sub-delimiters" per RFC 3986, not characters it needs to escape.
// They are valid in URLs, but they collide with markdown's
// `[text](dest)` delimiters, which is what we're working around here.
const ENCODE_FOR_LINK_OVERRIDES = Object.freeze({ "(": "%28", ")": "%29" });

/**
 * Percent-encodes a single character for use inside a markdown link.
 *
 * @param {string} c - The character to encode.
 * @returns {string} The override for `(` / `)`, otherwise the result of
 *   `encodeURIComponent`.
 */
function encodeForLink(c) {
  return ENCODE_FOR_LINK_OVERRIDES[c] ?? encodeURIComponent(c);
}

/**
 * Checks whether the text seen so far could still grow into an allowed
 * token start.
 *
 * @param {string} string - Text accumulated after the opening `§`.
 * @returns {boolean} True if at least one allowed start begins with `string`.
 */
function isAllowedPrefix(string) {
  return ALLOWED_TOKEN_STARTS.some(start => start.startsWith(string));
}

/**
 * Checks whether the text seen so far is exactly one of the allowed starts.
 *
 * @param {string} string - Text accumulated after the opening `§`.
 * @returns {boolean} True on an exact match.
 */
function isExactAllowedStart(string) {
  return ALLOWED_TOKEN_STARTS.includes(string);
}

/**
 * Appends text to the plain output and keeps `state.recentPlain` in sync.
 *
 * @param {string[]} plain - Output fragments for the current chunk (mutated).
 * @param {object} options
 * @param {object} options.state - Parser state; `recentPlain` is updated to
 *   the last URL_CONTEXT_LOOKBACK_CHARS characters emitted.
 * @param {string} options.str - The text to emit.
 */
function pushPlain(plain, { state, str }) {
  plain.push(str);
  state.recentPlain = (state.recentPlain + str).slice(
    -URL_CONTEXT_LOOKBACK_CHARS
  );
}

/**
 * Leaves token mode and clears the accumulated token text.
 *
 * @param {object} state - Parser state (mutated in place).
 */
function resetTokenState(state) {
  state.inToken = false;
  state.tokenCandidate = false;
  state.tokenBuffer = "";
}

/**
 * Reports whether the parentheses in a URL fail to pair up.
 *
 * @param {string} url - The URL to inspect.
 * @returns {boolean} True if a `)` appears before its `(`, or if any `(`
 *   is left unclosed.
 */
function hasUnbalancedParens(url) {
  let balance = 0;
  for (const c of url) {
    if (c === "(") {
      balance++;
    } else if (c === ")") {
      balance--;
      if (balance < 0) {
        return true;
      }
    }
  }
  return balance !== 0;
}

/**
 * Bug 2017972
 * Turn a URL token back into the text we want to show in the chat.
 *
 * Two cases:
 * 1. The token is sitting inside a markdown link, like
 *    `[click](§url§)`. We drop the URL straight in. We replace any
 *    characters that would confuse markdown's link parser with their
 *    percent-encoded equivalents:
 *      - spaces and `<` `>` always get encoded
 *      - `(` and `)` only get encoded when the URL has an unbalanced
 *        pair, which is what the parser actually trips on. URLs with
 *        matched parens (Wikipedia etc.) are left alone so the link
 *        looks normal on hover.
 * 2. The token is on its own, like `See §url§ for details`. We wrap
 *    the URL in `<...>` so it renders as a clickable link. Spaces
 *    and `<` `>` get percent-encoded so they can't break out of the
 *    wrapper.
 *
 * @param {string} url - The URL the token resolves to.
 * @param {string} recentPlain - Tail of recently emitted plain text,
 *   used to detect whether the token sits inside a markdown link.
 * @returns {string} The text to inject into the streamed plain output.
 */
function expandUrlToken(url, recentPlain) {
  const isMarkdownLink = AT_MARKDOWN_LINK_URL_RE.test(recentPlain);
  if (isMarkdownLink) {
    const encodePattern = hasUnbalancedParens(url) ? /[\s<>()]/g : /[\s<>]/g;
    return url.replace(encodePattern, encodeForLink);
  }
  return `<${url.replace(/[\s<>]/g, encodeForLink)}>`;
}

/**
 * Creates a new token stream parser state object.
 *
 * @returns {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} Parser state with token tracking.
 */
export function createParserState() {
  return {
    // Indicates if we are currently inside a token
    inToken: false,
    // Buffer to accumulate token data
    tokenBuffer: "",
    // Indicates if the current token is still a candidate for being valid
    tokenCandidate: false,
    // Indicates if there is a pending opening token character to process
    pendingOpen: false,
    // The last few characters we just sent to the chat. We peek at
    // this when expanding a URL token to figure out whether the token
    // is inside a markdown link or standing on its own.
    recentPlain: "",
  };
}

/**
 * Parses a raw token string into key-value pairs.
 *
 * The split happens at the first colon; both sides are trimmed.
 *
 * @param {string} raw - Content between §...§, e.g. "search: query"
 * @returns {{key: string, value: string} | null} Parsed token with key and
 *   value, or null if the text is empty, has no colon, or has an empty key.
 */
export function parseToken(raw) {
  const text = String(raw ?? "").trim();
  if (!text) {
    return null;
  }

  const colonIndex = text.indexOf(":");
  if (colonIndex === -1) {
    return null; // require key:value
  }

  const key = text.slice(0, colonIndex).trim();
  const value = text.slice(colonIndex + 1).trim();
  if (!key) {
    return null; // prevent §: value§
  }

  return { key, value };
}

/**
 * Consumes a stream chunk and extracts tokens and plain text.
 *
 * Tokens are only recognized when the opening "§" is immediately followed by an
 * allowed token start (e.g. ALLOWED_TOKEN_STARTS). Otherwise the "§"
 * is treated as literal text and streaming continues without stalling.
 *
 * `url_token` tokens are not returned in `tokens`; they are expanded into
 * `plainText` via `tokenToUrl`, and dropped when the map has no entry.
 *
 * @param {string} chunk - The chunk of text to parse.
 * @param {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} state - Parser state object (mutated in place).
 * @param {Map} tokenToUrl - Map a URL token to the full URL.
 * @returns {{
 *   plainText: string,
 *   tokens: Array<{key: string, value: string}>
 * }} Parsed plain text and tokens.
 */
export function consumeStreamChunk(chunk, state, tokenToUrl = new Map()) {
  const tokens = [];
  const plain = [];
  let chunkString = String(chunk ?? "");

  // A TOKEN_CHARACTER was seen at the end of the last chunk; treat it as
  // opening now.
  if (state.pendingOpen) {
    chunkString = TOKEN_CHARACTER + chunkString;
    state.pendingOpen = false;
  }

  // Process each character in the chunk
  for (let i = 0; i < chunkString.length; i++) {
    const char = chunkString[i];
    const isTokenChar = char === TOKEN_CHARACTER;

    // ---- Normal character (not §) ----
    if (!isTokenChar) {
      // Plain text mode
      if (!state.inToken) {
        pushPlain(plain, { state, str: char });
        continue;
      }

      // Token mode: accumulate
      state.tokenBuffer += char;

      // If we already confirmed it's a real token, keep accumulating.
      if (!state.tokenCandidate) {
        continue;
      }

      // Candidate token: decide ASAP if it's real or literal.
      if (
        state.tokenBuffer.length > MAX_START_LEN ||
        !isAllowedPrefix(state.tokenBuffer)
      ) {
        pushPlain(plain, { state, str: TOKEN_CHARACTER + state.tokenBuffer });
        resetTokenState(state);
        continue;
      }

      if (isExactAllowedStart(state.tokenBuffer)) {
        state.tokenCandidate = false; // confirmed
      }
      continue;
    }

    // ---- § character ----

    // Opening §
    if (!state.inToken) {
      // If § is the last char in this chunk, defer the decision to the next
      // chunk.
      if (i === chunkString.length - 1) {
        state.pendingOpen = true;
        continue;
      }

      state.inToken = true;
      state.tokenCandidate = true;
      state.tokenBuffer = "";
      continue;
    }

    // Closing § (we were inToken)
    if (state.tokenCandidate) {
      // Never confirmed allowed start => literal text, don't stall streaming.
      pushPlain(plain, {
        state,
        str: TOKEN_CHARACTER + state.tokenBuffer + TOKEN_CHARACTER,
      });
    } else {
      try {
        const parsed = parseToken(state.tokenBuffer);
        if (parsed?.key === "url_token") {
          // Eagerly convert the url_token back to its original URL. The URL
          // tokens should only exist during the conversation to the language
          // model. They get expanded everywhere else in the system for URL
          // security tracking and the rendering of messages for users.
          const url = tokenToUrl.get(parsed.value);
          if (url) {
            pushPlain(plain, {
              state,
              str: expandUrlToken(url, state.recentPlain),
            });
          }
        } else if (parsed) {
          tokens.push(parsed);
        }
      } catch {
        // Deliberate best-effort: if the lookup or expansion throws (for
        // example `tokenToUrl` has no usable `get`, or the mapped value is
        // not a string), the token is dropped so the rest of the stream
        // still gets processed.
      }
    }

    resetTokenState(state);
  }

  return { plainText: plain.join(""), tokens };
}

/**
 * Flushes any remaining unclosed token or pending section symbol as literal text.
 *
 * @param {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} state - Parser state object (mutated in place).
 * @returns {string} Literal text for any unflushed remainder, or an empty string.
 */
export function flushTokenRemainder(state) {
  let out = "";

  if (state.pendingOpen) {
    out += TOKEN_CHARACTER;
    state.pendingOpen = false;
  }

  if (state.inToken) {
    out += TOKEN_CHARACTER + state.tokenBuffer;
    resetTokenState(state);
  }

  return out;
}