/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

// @ts-check

/**
 * @see {extractTextFromDOM} for a high level overview of this file.
 */

/**
 * @import { GetTextOptions } from './PageExtractor.d.ts'
 */

/** Matches runs of whitespace so that they can be collapsed into a single space. */
const WHITESPACE_REGEX = /\s+/g;

/**
 * Characters that are backslash-escaped inside of markdown link text. The backslash
 * itself is included, otherwise link text ending in "\" would escape the closing
 * bracket of the link and produce invalid markdown.
 */
const MARKDOWN_TEXT_ESCAPE_REGEX = /[\\\[\]()]/g;

/** Parentheses in URLs are percent-encoded so they cannot terminate the markdown link. */
const OPEN_PAREN_REGEX = /\(/g;
const CLOSE_PAREN_REGEX = /\)/g;

/**
 * Tags whose text nodes are never rendered to the user. `innerText` never reports
 * their content, so the manual DOM walk used for markdown links must skip them too.
 */
const NON_RENDERED_TAGS = new Set([
  "HEAD",
  "NOEMBED",
  "NOFRAMES",
  "NOSCRIPT",
  "SCRIPT",
  "STYLE",
  "TEMPLATE",
]);

/**
 * Returns the resolved URL of an anchor as a string, or "" when there is none.
 *
 * HTML anchors expose `href` as a string. SVG `<a>` elements expose an
 * SVGAnimatedString object instead, in which case the `baseVal` is resolved
 * against the element's base URI.
 *
 * @param {Element} anchor
 * @returns {string}
 */
function getAnchorHref(anchor) {
  const { href } = /** @type {any} */ (anchor);
  if (typeof href === "string") {
    return href;
  }

  const baseVal = href?.baseVal;
  if (typeof baseVal !== "string" || !baseVal) {
    return "";
  }

  try {
    return new URL(baseVal, anchor.baseURI).href;
  } catch {
    // The href could not be parsed as a URL, so there is no link to report.
    return "";
  }
}

/**
 * The context for extracting text content from the DOM.
 */
class ExtractionContext {
  /**
   * Set of nodes that have already been processed, used to avoid duplicating text extraction.
   *
   * @type {Set<Node>}
   */
  #processedNodes = new Set();

  /**
   * The text-extraction options, provided at initialization.
   *
   * @type {GetTextOptions}
   */
  #options;

  /**
   * The accumulated text content that has been extracted from the DOM.
   *
   * @type {string}
   */
  #textContent = "";

  /**
   * The unique link URLs that were discovered during extraction.
   *
   * @type {Set<string>}
   */
  #links = new Set();

  /**
   * When extracting content just from the viewport, this value will be set.
   *
   * @type {{ top: number; left: number; right: number; bottom: number } | null}
   */
  #viewportRect = null;

  /**
   * Constructs a new extraction context with the provided options.
   *
   * @param {Document} document
   * @param {GetTextOptions} options
   */
  constructor(document, options) {
    this.#options = options;

    if (options.justViewport) {
      // Both the window and its visual viewport may be null, in which case the
      // viewport filtering is left disabled rather than throwing.
      const visualViewport = document.defaultView?.visualViewport;
      if (visualViewport) {
        const { offsetTop, offsetLeft, width, height } = visualViewport;
        this.#viewportRect = {
          top: offsetTop,
          left: offsetLeft,
          right: offsetLeft + width,
          bottom: offsetTop + height,
        };
      }
    }
  }

  /**
   * Accumulated text content produced during traversal.
   *
   * @returns {string}
   */
  get textContent() {
    return this.#textContent;
  }

  /**
   * The unique links collected so far, in insertion order.
   *
   * @returns {string[]}
   */
  get links() {
    return Array.from(this.#links);
  }

  /**
   * Records a link. Duplicates are ignored since links are stored in a Set.
   *
   * @param {string} href
   */
  maybeAddLink(href) {
    this.#links.add(href);
  }

  /**
   * Records the link of an anchor element when it has an href attribute that
   * resolves to a non-empty URL.
   *
   * @param {Element} anchor
   */
  #addAnchorLink(anchor) {
    // Check the raw attribute first to avoid URL resolution if it is not needed.
    if (!anchor.hasAttribute("href")) {
      return;
    }
    const href = getAnchorHref(anchor);
    if (href) {
      this.maybeAddLink(href);
    }
  }

  /**
   * If this node is an anchor element, add its href to the links set.
   * Used for container nodes that will be subdivided, to capture anchors
   * that wrap block-level content.
   *
   * @param {Node} node
   */
  addLinkIfAnchor(node) {
    const element = asElement(node);
    if (element?.nodeName !== "A") {
      return;
    }
    const href = getAnchorHref(element);
    if (href) {
      this.maybeAddLink(href);
    }
  }

  /**
   * Extract all links from a node: the node itself when it is an anchor, otherwise
   * its closest anchor ancestor, followed by every anchor descendant.
   *
   * Should only be called on leaf/accepted blocks, not on containers
   * that will be subdivided.
   *
   * @param {Node} node
   */
  extractLinksFromBlock(node) {
    const element = asElement(node);
    if (!element) {
      return;
    }

    const { nodeName } = element;
    if (nodeName === "A") {
      // The node itself is an anchor.
      this.#addAnchorLink(element);
    } else if (nodeName !== "BODY" && nodeName !== "HTML") {
      // Check ancestor anchors (for anchors wrapping block content). This is
      // skipped for top-level elements that can't be inside anchors.
      const ancestorAnchor = element.closest("a");
      if (ancestorAnchor) {
        this.#addAnchorLink(ancestorAnchor);
      }
    }

    // Extract links from anchor descendants.
    for (const anchor of element.getElementsByTagName("a")) {
      this.#addAnchorLink(anchor);
    }
  }

  /**
   * Returns true if a condition has been met such that the text
   * extraction should stop early, otherwise false.
   *
   * @returns {boolean}
   */
  shouldStopExtraction() {
    const { sufficientLength } = this.#options;
    return (
      sufficientLength !== undefined &&
      this.#textContent.length >= sufficientLength
    );
  }

  /**
   * Returns true if this node or its ancestor's text content has
   * already been extracted from the DOM.
   *
   * @param {Node} node
   * @returns {boolean}
   */
  #isNodeProcessed(node) {
    if (this.#processedNodes.has(node)) {
      return true;
    }

    for (const ancestor of getAncestorsIterator(node)) {
      if (this.#processedNodes.has(ancestor)) {
        return true;
      }
    }

    return false;
  }

  /**
   * When capturing content only in the viewport, skip nodes that are outside of it.
   *
   * @param {Node} node
   * @returns {boolean} True only when a viewport rect is set and the node's
   *   bounding client rect does not intersect it.
   */
  maybeOutOfViewport(node) {
    const viewportRect = this.#viewportRect;
    if (!viewportRect) {
      // We don't have a viewport rect, so skip this check.
      return false;
    }

    const element = getHTMLElementForStyle(node);
    if (!element) {
      return false;
    }

    const rect = element.getBoundingClientRect();
    if (!rect) {
      return false;
    }

    return (
      rect.bottom <= viewportRect.top ||
      rect.top >= viewportRect.bottom ||
      rect.right <= viewportRect.left ||
      rect.left >= viewportRect.right
    );
  }

  /**
   * Append the node's text content to the accumulated text only if the node
   * itself as well as no ancestor of the node has already been processed.
   *
   * @param {Node} node
   */
  maybeAppendTextContent(node) {
    if (this.#isNodeProcessed(node)) {
      return;
    }

    // The node is marked as processed before the visibility checks, so that the
    // descendants of a hidden or out-of-viewport node are skipped as well.
    this.#processedNodes.add(node);

    if (isNodeHidden(node)) {
      return;
    }

    if (this.maybeOutOfViewport(node)) {
      // This only can return true when we're capturing just the viewport nodes.
      return;
    }

    const element = asHTMLElement(node);
    const text = asTextNode(node);
    let innerText = "";

    if (element) {
      innerText = this.#hasInlineAnchors(element)
        ? this.#extractTextWithMarkdownLinks(element)
        : element.innerText.trim();
    } else if (text?.nodeValue) {
      innerText = text.nodeValue.trim();
    }

    if (innerText) {
      this.#textContent += `\n${innerText}`;
    }
  }

  /**
   * Check if a block contains any inline anchors that should be formatted as markdown.
   * Anchors that wrap block content are excluded since they will be handled by
   * the block splitting strategy.
   *
   * @param {HTMLElement} element
   * @returns {boolean}
   */
  #hasInlineAnchors(element) {
    if (element.nodeName === "A") {
      return !this.#wrapsBlockContent(element);
    }

    for (const anchor of element.querySelectorAll("a")) {
      if (!this.#wrapsBlockContent(anchor)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Extract text from an element, formatting inline anchors as markdown.
   * The content is traversed recursively in document order without
   * cloning or modifying the DOM.
   *
   * @param {HTMLElement} element
   * @returns {string}
   */
  #extractTextWithMarkdownLinks(element) {
    // Handle the simple case where the element itself is an inline anchor.
    if (element.nodeName === "A" && !this.#wrapsBlockContent(element)) {
      return this.#formatAnchorAsMarkdown(element);
    }

    /** @type {string[]} */
    const parts = [];
    this.#walkAndExtract(element, parts);

    // Normalize whitespace for clean output.
    return parts.join("").replace(WHITESPACE_REGEX, " ").trim();
  }

  /**
   * Recursively walk the DOM and extract text, formatting inline anchors as markdown.
   *
   * @param {Node} node
   * @param {string[]} parts - Receives the extracted text fragments in document order.
   */
  #walkAndExtract(node, parts) {
    if (node.nodeType === Node.TEXT_NODE) {
      const text = node.nodeValue ?? "";
      if (text) {
        parts.push(text);
      }
      return;
    }

    const element = asElement(node);
    if (!element) {
      return;
    }

    if (NON_RENDERED_TAGS.has(element.nodeName.toUpperCase())) {
      // The text inside of tags such as <script> and <style> is never displayed.
      // Skip it so this path agrees with the `innerText` based extraction.
      return;
    }

    if (element.nodeName === "A" && !this.#wrapsBlockContent(element)) {
      // Inline anchor - format as markdown.
      parts.push(this.#formatAnchorAsMarkdown(element));
      return;
    }

    // This is either an anchor wrapping block content, which is extracted normally
    // without markdown, or any other element. Recurse into the children.
    for (const child of element.childNodes) {
      this.#walkAndExtract(child, parts);
    }
  }

  /**
   * Format an anchor element as markdown [text](url).
   * Uses the resolved href for the URL to get absolute URLs.
   *
   * @param {Element} anchor
   * @returns {string} The markdown link, the plain link text when there is no URL,
   *   or an empty string when the anchor has no usable text.
   */
  #formatAnchorAsMarkdown(anchor) {
    // Normalize whitespace in link text for clean markdown output
    // e.g., <a>Some \n text</a> becomes [Some text](url)
    let linkText = (anchor.textContent ?? "")
      .replace(WHITESPACE_REGEX, " ")
      .trim();

    // For image-only anchors, use alt text if available.
    if (!linkText) {
      const img = anchor.querySelector("img");
      if (img) {
        linkText = (img.alt ?? "").trim();
      }
    }

    // No text means we can't produce meaningful markdown.
    if (!linkText) {
      return "";
    }

    // Use the resolved (absolute) URL of the anchor.
    const href = getAnchorHref(anchor);
    if (!href) {
      return linkText;
    }

    // Escape backslashes, brackets and parentheses in the link text, and
    // parentheses in the URL for valid markdown.
    const escapedText = linkText.replace(MARKDOWN_TEXT_ESCAPE_REGEX, "\\$&");
    const escapedHref = href
      .replace(OPEN_PAREN_REGEX, "%28")
      .replace(CLOSE_PAREN_REGEX, "%29");

    return `[${escapedText}](${escapedHref})`;
  }

  /**
   * Check if an anchor element wraps block-level content.
   * Such anchors should not be formatted as markdown since their
   * content will be extracted separately by the block splitting strategy.
   * Checks recursively to handle cases like <a><span><div>...</div></span></a>.
   *
   * @param {Element} element
   * @returns {boolean}
   */
  #wrapsBlockContent(element) {
    for (const child of element.childNodes) {
      const childElement = asElement(child);
      if (!childElement) {
        continue;
      }

      if (getIsBlockLike(childElement)) {
        return true;
      }

      // Recursively check inline children for nested block content.
      if (this.#wrapsBlockContent(childElement)) {
        return true;
      }
    }

    return false;
  }
}

/**
 * Extracts visible text content from the DOM.
 * By default, this extracts content from the entire page.
 *
 * Callers may specify filters for the extracted text via
 * the supported options @see {GetTextOptions}.
 *
 * @param {Document} document
 * @param {GetTextOptions} options
 *
 * @returns {{ text: string, links: string[] }}
 *
 * In-depth documentation:
 *
 * Webpages are complicated documents. There are many different semantic structures
 * like <article>, aria controls or even specifications like schema.org. The DOMExtractor
 * can use these as hints, but ultimately the goal is to extract the user visible text
 * from a webpage in the same way it is presented to the user. Text in layout is done
 * through inline elements that go through reflow within a block. The intent of this
 * algorithm is to collect all of the blocks on the screen, and convert each block into
 * a paragraph of plain text that is representative of the information that is displayed
 * on the screen.
 *
 * For example:
 *
 * <div>
 *   <p>
 *     This is an example of a <b>block</b> with <i>inline</i> elements.
 *   </p>
 *   <p>
 *     The <div style="display: inline">computed style</div> is respected for extraction.
 *   </p>
 *   <p style="display: none">
 *     Only visible text will be extracted.
 *   </p>
 * </div>
 *
 * If extraction is run on this document you will get the following lines:
 *
 * ```
 * This is an example of a block with inline elements.\n
 * The computed style is respected for extraction.\n
 * ```
 *
 * This text should be formatted in a way that a language model can infer the meaning
 * of the page, and work efficiently with the returned structure. A user reads and
 * understands the content of the page based on how it's displayed to them. Therefore
 * a language model should get plain text that resembles that as closely as possible.
 *
 * The DOMExtractor supports different modes to limit the amount of content, or provide
 * only information that is in the viewport. Ultimately it should be able to take any
 * type of request from things like the get_page_content tool call, and fulfill that
 * request in an efficient way that returns content as much as possible as how a user
 * would actually experience it once rendered to the page.
 *
 * This strategy differs from more traditional scraping methods, as the browser has
 * access to the full styled page. We can measure the computed style of elements to
 * determine visibility and the actually computed block status (e.g. "display: block"
 * and "display: inline")
 */
export function extractTextFromDOM(document, options) {
  const context = new ExtractionContext(document, options);

  subdivideAndExtractText(document.body, context);

  return {
    text: context.textContent.trim(),
    links: context.links,
  };
}

/**
 * Tags excluded from text extraction.
 */
const CONTENT_EXCLUDED_TAGS = new Set([
  // TODO - We should add this and write some tests.
  "CODE",

  // The following are deprecated tags.
  "DIR",
  "APPLET",

  // The following are embedded elements, and are not supported (yet).
  "MATH",
  "EMBED",
  "OBJECT",
  "IFRAME",

  // This is an SVG tag that can contain arbitrary XML, ignore it.
  "METADATA",

  // These are elements that are treated as opaque by Firefox which causes their
  // innerHTML property to be just the raw text node behind it. Any text that is sent as
  // HTML must be valid, and there is no guarantee that the innerHTML is valid.
  "NOSCRIPT",
  "NOEMBED",
  "NOFRAMES",

  // Do not parse the HEAD tag.
  "HEAD",

  // These are not user-visible tags.
  "STYLE",
  "SCRIPT",
  "TEMPLATE",
]);

/** A selector list matching any of the excluded tags. */
const CONTENT_EXCLUDED_NODE_SELECTOR = [...CONTENT_EXCLUDED_TAGS].join(",");

/**
 * Get the ShadowRoot from the chrome-only openOrClosedShadowRoot API.
 * This allows for extracting the content from WebComponents, which is not
 * normally feasible in non-privileged contexts.
 *
 * @param {Node} node
 *
 * @returns {ShadowRoot | null}
 */
function getShadowRoot(node) {
  return asElement(node)?.openOrClosedShadowRoot ?? null;
}

/**
 * Determines if a node is ready for text extraction, or if it should be subdivided
 * further. It doesn't check if the node has already been processed. This is done
 * at the block level.
 *
 * @param {Node} node
 * @returns {number} - NodeFilter acceptance status.
 */
function determineBlockStatus(node) {
  if (!node) {
    return NodeFilter.FILTER_REJECT;
  }

  if (getShadowRoot(node)) {
    // Shadow hosts are accepted so that the caller can walk into the shadow root.
    return NodeFilter.FILTER_ACCEPT;
  }

  if (isExcludedNode(node)) {
    // This is an explicitly excluded node.
    return NodeFilter.FILTER_REJECT;
  }

  if (
    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
    !hasNonWhitespaceTextNodes(node)
  ) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces to extract.
    return NodeFilter.FILTER_SKIP;
  }

  if (nodeNeedsSubdividing(node)) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces
    // to extract. It is presumed to be a wrapper of block elements.
    return NodeFilter.FILTER_SKIP;
  }

  // This textContent call is fairly expensive.
  if (!node.textContent?.trim().length) {
    // Check if this is an anchor with an image.
    // Accept these anchors so their links are captured, even without alt text.
    const element = asElement(node);
    if (element?.nodeName === "A" && element.querySelector("img")) {
      return NodeFilter.FILTER_ACCEPT;
    }

    // Do not use subtrees that are empty of text.
    return node.hasChildNodes()
      ? NodeFilter.FILTER_SKIP
      : NodeFilter.FILTER_REJECT;
  }

  // This node can be treated as entire block and is ready for text extraction.
  return NodeFilter.FILTER_ACCEPT;
}

/**
 * Determine if this element contains block-like content, and therefore needs to
 * be subdivided into smaller pieces rather than being extracted as a single block.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function nodeNeedsSubdividing(node) {
  const element = asElement(node);
  if (!element) {
    // Only elements need to be further subdivided.
    return false;
  }

  for (const childNode of element.childNodes) {
    if (!childNode) {
      continue;
    }

    switch (childNode.nodeType) {
      case Node.TEXT_NODE: {
        // Keep checking for more inline or text nodes.
        continue;
      }
      case Node.ELEMENT_NODE: {
        if (getIsBlockLike(childNode)) {
          // This node is a block node, so it needs further subdividing.
          return true;
        } else if (nodeNeedsSubdividing(childNode)) {
          // This non-block-like node may contain other block-like nodes.
          return true;
        }

        // Keep checking for more inline or text nodes.
        continue;
      }
      default: {
        // Any other type of child node (e.g. a comment) forces the subdivision.
        return true;
      }
    }
  }

  return false;
}

/**
 * Returns true if a node is hidden based on factors such as collapsed state and
 * computed style, otherwise false.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isNodeHidden(node) {
  const element = getHTMLElementForStyle(node);

  if (!element) {
    // If we cannot get an HTMLElement to check visibility, we should not
    // consider the node hidden. This can happen with cross-compartment
    // elements where HTMLElement.isInstance fails.
    return false;
  }

  // This is a cheap and easy check that will not compute style or force reflow.
  if (element.hidden) {
    // The element is explicitly hidden.
    return true;
  }

  // Handle open/closed <details> elements. This will also not compute style or force reflow.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details
  if (
    // The element is within a closed <details>
    element.closest("details:not([open])") &&
    // The element is not part of the <summary> of the <details>, which is always
    // visible, even when closed.
    !element.closest("summary")
  ) {
    // The element is within a closed <details> and is not part of the <summary>,
    // therefore it is not visible.
    return true;
  }

  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
  // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10
  if (
    !(
      element.offsetWidth ||
      element.offsetHeight ||
      element.getClientRects().length
    )
  ) {
    return true;
  }

  // The element may still have a zero-sized bounding client rectangle.
  const boundingClientRect = element.getBoundingClientRect();
  if (
    boundingClientRect &&
    (boundingClientRect.width === 0 || boundingClientRect.height === 0)
  ) {
    return true;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
    return true;
  }

  // This flushes the style, which is a performance cost.
  const style = ownerGlobal.getComputedStyle(element);
  if (!style) {
    // We were unable to compute the style, so we will assume it is not visible.
    return true;
  }

  // This is an issue with the DOM library generation.
  const { display, visibility, opacity } = style;

  return (
    display === "none" ||
    visibility === "hidden" ||
    visibility === "collapse" ||
    opacity === "0"
  );
}

/**
 * Returns true when the node itself must not be used for text extraction: it is
 * neither a text node nor an element, or it is an element with an excluded tag.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isExcludedNode(node) {
  // Property access can be expensive, so destructure required properties so they are
  // not accessed multiple times.
  const { nodeType } = node;

  if (nodeType === Node.TEXT_NODE) {
    // Text nodes are never excluded.
    return false;
  }

  const element = asElement(node);
  if (!element) {
    // Only elements and text nodes should be considered.
    return true;
  }

  const { nodeName } = element;

  // SVG tags can be lowercased, so ensure everything is uppercased.
  return CONTENT_EXCLUDED_TAGS.has(nodeName.toUpperCase());
}

/**
 * Like `isExcludedNode` but looks at the full subtree. Used to see whether
 * we can consider a subtree, or whether we should split it into smaller
 * branches first to try to exclude more of the content.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector
 *
 * @returns {boolean}
 */
function containsExcludedNode(node, excludedNodeSelector) {
  return Boolean(asElement(node)?.querySelector(excludedNodeSelector));
}

/**
 * Test whether any of the direct child text nodes of the node are non-whitespace
 * text nodes.
 *
 * For example:
 *   - `<p>test</p>`: yes
 *   - `<p> </p>`: no
 *   - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 *
 * @returns {boolean}
 */
function hasNonWhitespaceTextNodes(node) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Only check element nodes.
    return false;
  }

  for (const child of node.childNodes) {
    const textNode = asTextNode(child);
    if (textNode?.textContent?.trim()) {
      // A text node with content was found.
      return true;
    }
  }

  // No text nodes with content were found.
  return false;
}

/**
 * Start walking down through a node's subtree and decide which nodes to extract content
 * from. This first node is the root of the page.
 *
 * The nodes go through a process of subdivision until an appropriate sized chunk
 * of inline text can be found.
 *
 * @param {Node} node
 * @param {ExtractionContext} context
 */
function subdivideAndExtractText(node, context) {
  if (context.shouldStopExtraction()) {
    return;
  }

  switch (determineBlockStatus(node)) {
    case NodeFilter.FILTER_REJECT: {
      // This node is rejected as it shouldn't be used for text extraction.
      return;
    }

    // Either a shadow host or a block element
    case NodeFilter.FILTER_ACCEPT: {
      const shadowRoot = getShadowRoot(node);
      if (shadowRoot) {
        processSubdivide(shadowRoot, context);
      } else {
        context.extractLinksFromBlock(node);
        context.maybeAppendTextContent(node);
      }
      break;
    }

    case NodeFilter.FILTER_SKIP: {
      // This node may have text to extract, but it needs to be subdivided into smaller
      // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
      // that contain enough inline elements to extract.

      // Only check if this node itself is an anchor (for anchors wrapping block content).
      // Don't scan descendants here - they'll be processed when child blocks are accepted.
      context.addLinkIfAnchor(node);
      processSubdivide(node, context);
      break;
    }
  }
}

/**
 * Add qualified nodes to have their text content extracted by recursively walking
 * through the DOM tree of nodes, including elements in the Shadow DOM.
 *
 * @param {Node} node
 * @param {ExtractionContext} context
 */
function processSubdivide(node, context) {
  if (context.shouldStopExtraction()) {
    return;
  }

  const { ownerDocument } = node;
  if (!ownerDocument) {
    return;
  }

  // This iterator will contain each node that has been subdivided enough to have its
  // text extracted.
  const nodeIterator = ownerDocument.createTreeWalker(
    node,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    determineBlockStatus
  );

  let currentNode;
  while ((currentNode = nodeIterator.nextNode())) {
    const shadowRoot = getShadowRoot(currentNode);
    if (shadowRoot) {
      processSubdivide(shadowRoot, context);
    } else {
      context.extractLinksFromBlock(currentNode);
      context.maybeAppendTextContent(currentNode);
    }

    if (context.shouldStopExtraction()) {
      return;
    }
  }
}

/**
 * Returns an iterator of a node's ancestors, starting at the parent node and
 * stopping before the document element. Nothing is yielded for a node without
 * an owner document.
 *
 * @param {Node} node
 *
 * @yields {Node}
 */
function* getAncestorsIterator(node) {
  const document = node.ownerDocument;
  if (!document) {
    return;
  }

  for (
    let parent = node.parentNode;
    parent && parent !== document.documentElement;
    parent = parent.parentNode
  ) {
    yield parent;
  }
}

/**
 * Reads the elements computed style and determines if the element is a block-like
 * element or not. Every element that lays out like a block should be used as a unit
 * for text extraction.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function getIsBlockLike(node) {
  const element = asElement(node);
  if (!element) {
    return false;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    return false;
  }

  if (element.namespaceURI === "http://www.w3.org/2000/svg") {
    // SVG elements will report as inline, but there is no block layout in SVG.
    // Treat every SVG element as being block so that every node will be subdivided.
    return true;
  }

  /** @type {Record<string, string>} */
  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };

  return style.display !== "inline" && style.display !== "none";
}

/**
 * Use TypeScript to determine if the Node is an Element.
 *
 * @param {Node | null | undefined} node
 * @returns {Element | null}
 */
function asElement(node) {
  if (node?.nodeType === Node.ELEMENT_NODE) {
    return /** @type {HTMLElement} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is a Text node.
 *
 * @param {Node | null} node
 *
 * @returns {Text | null}
 */
function asTextNode(node) {
  if (node?.nodeType === Node.TEXT_NODE) {
    return /** @type {Text} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is an HTMLElement.
 *
 * @param {Node | null} node
 *
 * @returns {HTMLElement | null}
 */
function asHTMLElement(node) {
  if (HTMLElement.isInstance(node)) {
    return node;
  }
  return null;
}

/**
 * This function returns the correct element to determine the
 * style of node.
 *
 * @param {Node} node
 *
 * @returns {HTMLElement | null}
 */
function getHTMLElementForStyle(node) {
  const element = asHTMLElement(node);
  if (element) {
    return element;
  }

  if (node.parentElement) {
    return asHTMLElement(node.parentElement);
  }

  // For cases like text node where its parent is ShadowRoot,
  // we'd like to use flattenedTreeParentNode
  if (node.flattenedTreeParentNode) {
    return asHTMLElement(node.flattenedTreeParentNode);
  }

  // If the text node is not connected or doesn't have a frame.
  return null;
}