/**
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

// Delimiter that opens and closes an inline token in the streamed text,
// e.g. `§search: query§`.
export const TOKEN_CHARACTER = "§";

// A `§` only starts a token when the text right after it begins with one
// of these prefixes; anything else is treated as literal text.
const ALLOWED_TOKEN_STARTS = [
  "search:",
  "existing_memory:",
  "followup:",
  "url_token:",
  "kit:",
];

// Length of the longest allowed prefix. Once a candidate token has buffered
// more characters than this without matching, it cannot be a token.
const MAX_START_LEN = ALLOWED_TOKEN_STARTS.reduce(
  (longest, start) => Math.max(longest, start.length),
  0
);

// Number of trailing characters of emitted plain text that `pushPlain`
// keeps in `state.recentPlain`. When a URL token arrives,
// `expandUrlToken` peeks at that tail to tell whether the token is inside
// a markdown link `[click](URL)` or sitting on its own. The `](` must
// still be inside the window to be detected, so 16 chars tolerates up to
// 14 whitespace characters between `](` and the token.
const URL_CONTEXT_LOOKBACK_CHARS = 16;

// Matches when the recently emitted text ends in `](`, optionally followed
// by whitespace — meaning the URL portion of a markdown link comes next.
const AT_MARKDOWN_LINK_URL_RE = /\]\(\s*$/;

// `encodeURIComponent` does not escape `(` or `)`. Both are legal in URLs,
// but they clash with the `[text](dest)` delimiters of a markdown link, so
// their percent-encoded forms are supplied explicitly here.
const ENCODE_FOR_LINK_OVERRIDES = {
  "(": "%28",
  ")": "%29",
};

/**
 * Percent-encodes one matched character for use inside a link destination.
 *
 * @param {string} c - The character to encode.
 * @returns {string} The override from ENCODE_FOR_LINK_OVERRIDES when one
 *   exists, otherwise the result of `encodeURIComponent(c)`.
 */
function encodeForLink(c) {
  const override = ENCODE_FOR_LINK_OVERRIDES[c];
  if (override !== undefined && override !== null) {
    return override;
  }
  return encodeURIComponent(c);
}

/**
 * Tells whether the text buffered so far could still grow into one of the
 * allowed token starts.
 *
 * @param {string} string - Text accumulated after the opening token character.
 * @returns {boolean} True when at least one entry of ALLOWED_TOKEN_STARTS
 *   begins with `string` (an empty string matches every entry).
 */
function isAllowedPrefix(string) {
  for (const start of ALLOWED_TOKEN_STARTS) {
    if (start.startsWith(string)) {
      return true;
    }
  }
  return false;
}

/**
 * Tells whether the buffered text is exactly one of the allowed token starts.
 *
 * @param {string} string - Text accumulated after the opening token character.
 * @returns {boolean} True when `string` equals an entry of ALLOWED_TOKEN_STARTS.
 */
function isExactAllowedStart(string) {
  return ALLOWED_TOKEN_STARTS.some(start => start === string);
}

/**
 * Appends a piece of plain text to the output and refreshes the lookback
 * tail stored on the parser state.
 *
 * @param {string[]} plain - Output pieces for the current chunk (mutated).
 * @param {object} options
 * @param {{recentPlain: string}} options.state - Parser state; its
 *   `recentPlain` is replaced with the last URL_CONTEXT_LOOKBACK_CHARS
 *   characters of the previous tail plus `str`.
 * @param {string} options.str - The text to emit.
 */
function pushPlain(plain, { state, str } = {}) {
  plain.push(str);

  const extendedTail = state.recentPlain + str;
  state.recentPlain = extendedTail.slice(-URL_CONTEXT_LOOKBACK_CHARS);
}

/**
 * Checks whether the round brackets in a URL fail to pair up.
 *
 * @param {string} url - The URL to inspect.
 * @returns {boolean} True when a `)` appears with no `(` left to close, or
 *   when at least one `(` is still open at the end; false otherwise.
 */
function hasUnbalancedParens(url) {
  let openCount = 0;

  for (const ch of url) {
    if (ch === ")") {
      // A closer with nothing open can never be matched later.
      if (openCount === 0) {
        return true;
      }
      openCount -= 1;
    } else if (ch === "(") {
      openCount += 1;
    }
  }

  return openCount > 0;
}

/**
 * Bug 2017972
 * Converts a resolved URL token into the text that is streamed to the chat.
 *
 * The output depends on where the token sits:
 *   - Standalone, e.g. `See §url§ for details`: the URL is wrapped in
 *     `<...>` so it renders as a clickable link. Whitespace, `<` and `>`
 *     are percent-encoded so they cannot break out of the wrapper.
 *   - Inside a markdown link, e.g. `[click](§url§)`: the URL is inserted
 *     as-is apart from percent-encoding characters that would confuse
 *     markdown's link parser:
 *       - whitespace, `<` and `>` are always encoded;
 *       - `(` and `)` are encoded only when the URL's parentheses are
 *         unbalanced. URLs with matched parens (Wikipedia etc.) are left
 *         alone so the link looks normal on hover.
 *
 * @param {string} url - The URL the token resolves to.
 * @param {string} recentPlain - Tail of recently emitted plain text,
 *   used to detect whether the token sits inside a markdown link.
 * @returns {string} The text to inject into the streamed plain output.
 */
function expandUrlToken(url, recentPlain) {
  // Standalone token: emit an autolink.
  if (!AT_MARKDOWN_LINK_URL_RE.test(recentPlain)) {
    const escapedUrl = url.replace(/[\s<>]/g, encodeForLink);
    return `<${escapedUrl}>`;
  }

  // Token is the destination of a markdown link.
  const unsafeChars = hasUnbalancedParens(url) ? /[\s<>()]/g : /[\s<>]/g;
  return url.replace(unsafeChars, encodeForLink);
}

/**
 * Builds a fresh state object for the token stream parser.
 *
 * @returns {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} A new state in plain-text mode with nothing buffered.
 */
export function createParserState() {
  const initialState = {
    // True while the parser is between an opening and a closing token
    // character.
    inToken: false,
    // Characters collected since the opening token character.
    tokenBuffer: "",
    // True while the buffered text has not yet been confirmed as a valid
    // token start.
    tokenCandidate: false,
    // True when an opening token character is waiting to be processed.
    pendingOpen: false,
    // Tail of the plain text most recently sent to the chat. Inspected
    // when a URL token is expanded, to decide whether the token sits
    // inside a markdown link or stands alone.
    recentPlain: "",
  };

  return initialState;
}

/**
 * Splits the raw text of a token into its key and value.
 *
 * The text is split at the first ":"; both sides are trimmed. The value
 * may be empty, the key may not.
 *
 * @param {string} raw - Content between §...§, e.g. "search: query"
 * @returns {{key: string, value: string} | null} The parsed key and value,
 *   or null when the text is empty, has no ":" or has an empty key.
 */
export function parseToken(raw) {
  const trimmed = String(raw ?? "").trim();

  // Empty input has no ":" either, so this guard also rejects it.
  const separatorAt = trimmed.indexOf(":");
  if (separatorAt < 0) {
    return null;
  }

  const key = trimmed.slice(0, separatorAt).trim();
  if (key === "") {
    // Rejects input such as `§: value§`.
    return null;
  }

  return { key, value: trimmed.slice(separatorAt + 1).trim() };
}

/**
 * Returns the parser to plain-text mode and discards any buffered token text.
 *
 * @param {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean
 * }} state - Parser state object (mutated in place).
 */
function resetTokenState(state) {
  state.inToken = false;
  state.tokenCandidate = false;
  state.tokenBuffer = "";
}

/**
 * Consumes a stream chunk and extracts tokens and plain text.
 *
 * Tokens are only recognized when the opening "§" is immediately followed by an
 * allowed token start (e.g. ALLOWED_TOKEN_STARTS). Otherwise the "§"
 * is treated as literal text and streaming continues without stalling.
 *
 * Additional behavior:
 *   - A "§" that is the last character of a chunk while in plain-text mode
 *     is not emitted; it is remembered in `state.pendingOpen` and
 *     re-processed at the start of the next chunk.
 *   - Once a token start is confirmed, everything up to the closing "§" is
 *     buffered in `state.tokenBuffer` (possibly across chunks) and not
 *     emitted as plain text.
 *   - `url_token` tokens are never returned in `tokens`. They are replaced
 *     in the plain text by their expanded URL; when `tokenToUrl` has no
 *     entry for the token value, nothing is emitted for it.
 *
 * @param {string} chunk - The chunk of text to parse.
 * @param {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} state - Parser state object (mutated in place).
 * @param {Map<string, string>} tokenToUrl - Maps a url_token value to the
 *   full URL.
 * @returns {{
 *   plainText: string,
 *   tokens: Array<{key: string, value: string}>
 * }} Parsed plain text and tokens.
 */
export function consumeStreamChunk(chunk, state, tokenToUrl = new Map()) {
  const tokens = [];
  const plain = [];

  let chunkString = String(chunk ?? "");

  // A TOKEN_CHARACTER was seen at the end of the last chunk; treat it as opening now.
  if (state.pendingOpen) {
    chunkString = TOKEN_CHARACTER + chunkString;
    state.pendingOpen = false;
  }

  // Process each character in the chunk
  for (let i = 0; i < chunkString.length; i++) {
    const char = chunkString[i];
    const isTokenChar = char === TOKEN_CHARACTER;

    // ---- Normal character (not §) ----
    if (!isTokenChar) {
      // Plain text mode
      if (!state.inToken) {
        pushPlain(plain, { state, str: char });
        continue;
      }

      // Token mode: accumulate
      state.tokenBuffer += char;

      // If we already confirmed it's a real token, keep accumulating.
      if (!state.tokenCandidate) {
        continue;
      }

      // Candidate token: decide ASAP if it's real or literal.
      if (
        state.tokenBuffer.length > MAX_START_LEN ||
        !isAllowedPrefix(state.tokenBuffer)
      ) {
        // Not a token after all: release the opening § and the buffered
        // text as plain output.
        pushPlain(plain, { state, str: TOKEN_CHARACTER + state.tokenBuffer });
        resetTokenState(state);
        continue;
      }

      if (isExactAllowedStart(state.tokenBuffer)) {
        state.tokenCandidate = false; // confirmed
      }
      continue;
    }

    // ---- § character ----

    // Opening §
    if (!state.inToken) {
      // If § is the last char in this chunk, defer the decision to the next chunk.
      if (i === chunkString.length - 1) {
        state.pendingOpen = true;
        continue;
      }

      state.inToken = true;
      state.tokenCandidate = true;
      state.tokenBuffer = "";
      continue;
    }

    // Closing § (we were inToken)
    if (state.tokenCandidate) {
      // Never confirmed allowed start => literal text, don't stall streaming.
      pushPlain(plain, {
        state,
        str: TOKEN_CHARACTER + state.tokenBuffer + TOKEN_CHARACTER,
      });
    } else {
      try {
        const parsed = parseToken(state.tokenBuffer);

        if (parsed?.key === "url_token") {
          // Eagerly convert the url_token back to its original URL. The URL tokens
          // should only exist during the conversation to the language model. They
          // get expanded everywhere else in the system for URL security tracking
          // and the rendering of messages for users.
          const url = tokenToUrl.get(parsed.value);
          if (url) {
            pushPlain(plain, {
              state,
              str: expandUrlToken(url, state.recentPlain),
            });
          }
        } else if (parsed) {
          tokens.push(parsed);
        }
      } catch {
        // Deliberately best-effort: a token that fails to parse or expand
        // is dropped so the rest of the stream keeps flowing.
      }
    }

    resetTokenState(state);
  }

  return { plainText: plain.join(""), tokens };
}

/**
 * Releases whatever the parser is still holding back as literal text: a
 * deferred opening token character and/or an unclosed token's buffer.
 * Call it when no more chunks will arrive.
 *
 * @param {{
 *   inToken: boolean,
 *   tokenBuffer: string,
 *   tokenCandidate: boolean,
 *   pendingOpen: boolean,
 *   recentPlain: string
 * }} state - Parser state object (mutated in place; `recentPlain` is left
 *   untouched).
 * @returns {string} Literal text for any unflushed remainder, or an empty string.
 */
export function flushTokenRemainder(state) {
  const pieces = [];

  // A trailing token character that was waiting for the next chunk.
  if (state.pendingOpen) {
    pieces.push(TOKEN_CHARACTER);
    state.pendingOpen = false;
  }

  // A token that was opened but never closed.
  if (state.inToken) {
    pieces.push(`${TOKEN_CHARACTER}${state.tokenBuffer}`);
    Object.assign(state, {
      inToken: false,
      tokenCandidate: false,
      tokenBuffer: "",
    });
  }

  return pieces.join("");
}