/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

// @ts-check

/**
 * Extracts the text of a document by subdividing `document.body` into blocks and
 * concatenating the trimmed text of every block. Each non-empty block is emitted
 * on its own line, and every line (including the first) is preceded by "\n".
 *
 * @param {Document} document
 * @returns {string}
 */
export function extractTextFromDOM(document) {
  const blocks = subdivideNodeIntoBlocks(document.body);

  let textContent = "";
  for (const block of blocks) {
    let innerText = "";
    const element = asHTMLElement(block);
    const text = asTextNode(block);

    // Blocks that are neither an HTMLElement nor a Text node contribute no text.
    if (element) {
      innerText = element.innerText.trim();
    } else if (text?.nodeValue) {
      innerText = text.nodeValue.trim();
    }

    if (innerText) {
      textContent += `\n${innerText}`;
    }
  }

  return textContent;
}

/**
 * Tags excluded from text extraction. The names are stored uppercased, and are
 * compared against uppercased node names in `isExcludedNode`.
 */
const CONTENT_EXCLUDED_TAGS = new Set([
  // TODO - We should add this and write some tests.
  // NOTE(review): "CODE" is currently a member of this set, so it is excluded
  // today. Confirm whether the TODO above is stale.
  "CODE",

  // The following are deprecated tags.
  "DIR",
  "APPLET",

  // The following are embedded elements, and are not supported (yet).
  "MATH",
  "EMBED",
  "OBJECT",
  "IFRAME",

  // This is an SVG tag that can contain arbitrary XML, ignore it.
  "METADATA",

  // These are elements that are treated as opaque by Firefox which causes their
  // innerHTML property to be just the raw text node behind it. Any text that is sent as
  // HTML must be valid, and there is no guarantee that the innerHTML is valid.
  "NOSCRIPT",
  "NOEMBED",
  "NOFRAMES",

  // Do not parse the HEAD tag.
  "HEAD",

  // These are not user-visible tags.
  "STYLE",
  "SCRIPT",
  "TEMPLATE",
]);

/**
 * A selector list matching every excluded tag.
 *
 * The names are lowercased on purpose. A type selector is only matched
 * case-insensitively against HTML-namespace elements in HTML documents. Against
 * every other element (SVG `<metadata>`, MathML `<math>`, or anything in an XML
 * document) it is compared case-sensitively with the element's local name, which
 * is lowercase for all of the tags above. An uppercased selector would silently
 * never match those elements.
 */
const CONTENT_EXCLUDED_NODE_SELECTOR = [...CONTENT_EXCLUDED_TAGS]
  .map(tagName => tagName.toLowerCase())
  .join(",");

/**
 * Get the ShadowRoot from the chrome-only openOrClosedShadowRoot API.
 * This allows for extracting the content from WebComponents, which is not
 * normally feasible in non-privileged contexts.
 *
 * @param {Node} node
 *
 * @returns {ShadowRoot | null}
 */
function getShadowRoot(node) {
  return asElement(node)?.openOrClosedShadowRoot ?? null;
}

/**
 * Determines if a node is ready for text extraction, or if it should be subdivided
 * further. It doesn't check if the node has already been processed. This is done
 * at the block level.
 *
 * This function is also used directly as the TreeWalker filter in
 * `processSubdivide`.
 *
 * @param {Node} node
 * @returns {number} - NodeFilter acceptance status.
 */
function determineBlockStatus(node) {
  if (!node) {
    return NodeFilter.FILTER_REJECT;
  }

  if (getShadowRoot(node)) {
    // Shadow hosts are accepted so that the caller can descend into the shadow root.
    return NodeFilter.FILTER_ACCEPT;
  }

  if (isExcludedNode(node)) {
    // This is an explicitly excluded node.
    return NodeFilter.FILTER_REJECT;
  }

  if (
    containsExcludedNode(node, CONTENT_EXCLUDED_NODE_SELECTOR) &&
    !hasNonWhitespaceTextNodes(node)
  ) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces to extract.
    return NodeFilter.FILTER_SKIP;
  }

  if (nodeNeedsSubdividing(node)) {
    // Skip this node, and dig deeper into its tree to cut off smaller pieces
    // to extract. It is presumed to be a wrapper of block elements.
    return NodeFilter.FILTER_SKIP;
  }

  // This textContent call is fairly expensive.
  if (!node.textContent?.trim().length) {
    // Do not use subtrees that are empty of text. A childless node is rejected
    // outright, otherwise its children are still visited.
    return !node.hasChildNodes()
      ? NodeFilter.FILTER_REJECT
      : NodeFilter.FILTER_SKIP;
  }

  // This node can be treated as entire block and is ready for text extraction.
  return NodeFilter.FILTER_ACCEPT;
}

/**
 * Determine if this node must be subdivided further before its text is extracted.
 * An element needs subdividing when any descendant reached through non-block-like
 * elements is block-like, or when it has a child that is neither an element nor a
 * text node.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function nodeNeedsSubdividing(node) {
  const element = asElement(node);
  if (!element) {
    // Only elements need to be further subdivided.
    return false;
  }

  for (const childNode of element.childNodes) {
    if (!childNode) {
      continue;
    }
    switch (childNode.nodeType) {
      case Node.TEXT_NODE: {
        // Keep checking for more inline or text nodes.
        continue;
      }
      case Node.ELEMENT_NODE: {
        if (getIsBlockLike(childNode)) {
          // This node is a block node, so it needs further subdividing.
          return true;
        } else if (nodeNeedsSubdividing(childNode)) {
          // This non-block-like node may contain other block-like nodes.
          return true;
        }
        // Keep checking for more inline or text nodes.
        continue;
      }
      default: {
        // Any other kind of child node forces a subdivision.
        return true;
      }
    }
  }

  return false;
}

/**
 * Returns true if an HTML element is hidden based on factors such as collapsed state and
 * computed style, otherwise false.
 *
 * @param {HTMLElement} element
 * @returns {boolean}
 */
function isHTMLElementHidden(element) {
  // This is a cheap and easy check that will not compute style or force reflow.
  if (element.hidden) {
    // The element is explicitly hidden.
    return true;
  }

  // Handle open/closed <details> elements. This will also not compute style or force reflow.
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/details
  if (
    // The element is within a closed <details>
    element.closest("details:not([open])") &&
    // The element is not part of the <summary>, which is always visible, even when closed.
    !element.closest("summary")
  ) {
    // The element is within a closed <details> and is not part of the <summary>,
    // therefore it is not visible.
    return true;
  }

  // This forces reflow, which has a performance cost, but this is also what JQuery uses for its :hidden and :visible.
  // https://github.com/jquery/jquery/blob/bd6b453b7effa78b292812dbe218491624994526/src/css/hiddenVisibleSelectors.js#L1-L10
  if (
    !(
      element.offsetWidth ||
      element.offsetHeight ||
      element.getClientRects().length
    )
  ) {
    return true;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    // We cannot compute the style without ownerGlobal, so we will assume it is not visible.
    return true;
  }

  // This flushes the style, which is a performance cost.
  const style = ownerGlobal.getComputedStyle(element);
  if (!style) {
    // We were unable to compute the style, so we will assume it is not visible.
    return true;
  }

  // This is an issue with the DOM library generation.
  // @ts-expect-error Property 'display' does not exist on type 'CSSStyleDeclaration'.ts(2339)
  const { display, visibility, opacity } = style;

  return (
    display === "none" ||
    visibility === "hidden" ||
    visibility === "collapse" ||
    opacity === "0"
  );
}

/**
 * Determines whether a node is excluded from text extraction. Text nodes are never
 * excluded, elements are excluded when their tag is in `CONTENT_EXCLUDED_TAGS`, and
 * every other kind of node is excluded.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isExcludedNode(node) {
  // Property access can be expensive, so destructure required properties so they are
  // not accessed multiple times.
  const { nodeType } = node;

  if (nodeType === Node.TEXT_NODE) {
    // Text nodes are never excluded.
    return false;
  }

  const element = asElement(node);
  if (!element) {
    // Only elements and text nodes should be considered.
    return true;
  }

  const { nodeName } = element;

  // SVG tags can be lowercased, so ensure everything is uppercased before the lookup.
  if (CONTENT_EXCLUDED_TAGS.has(nodeName.toUpperCase())) {
    // This is an excluded tag.
    return true;
  }

  return false;
}

/**
 * Like `isExcludedNode` but looks at the full subtree. Used to see whether
 * we can consider a subtree, or whether we should split it into smaller
 * branches first to try to exclude more of the content.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector
 *
 * @returns {boolean}
 */
function containsExcludedNode(node, excludedNodeSelector) {
  return Boolean(asElement(node)?.querySelector(excludedNodeSelector));
}

/**
 * Test whether any of the direct child text nodes of a node are non-whitespace
 * text nodes.
 *
 * For example:
 *   - `<p>test</p>`: yes
 *   - `<p> </p>`: no
 *   - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 *
 * @returns {boolean}
 */
function hasNonWhitespaceTextNodes(node) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Only check element nodes.
    return false;
  }

  for (const child of node.childNodes) {
    const textNode = asTextNode(child);
    if (textNode) {
      if (!textNode.textContent?.trim()) {
        // This is just whitespace.
        continue;
      }
      // A text node with content was found.
      return true;
    }
  }

  // No text nodes with content were found.
  return false;
}

/**
 * Start walking down through a node's subtree and decide which nodes to extract content
 * from. This first node is the root of the page.
 *
 * The nodes go through a process of subdivision until an appropriate sized chunk
 * of inline text can be found.
 *
 * @param {Node} node
 * @returns {Set<Node>}
 */
function subdivideNodeIntoBlocks(node) {
  /** @type {Set<Node>} */
  const blocks = new Set();

  switch (determineBlockStatus(node)) {
    case NodeFilter.FILTER_REJECT: {
      // This node is rejected as it shouldn't be used for text extraction.
      return blocks;
    }

    // Either a shadow host or a block element
    case NodeFilter.FILTER_ACCEPT: {
      const shadowRoot = getShadowRoot(node);
      if (shadowRoot) {
        processSubdivide(shadowRoot, blocks);
      } else {
        const element = asHTMLElement(node);
        if (element && isHTMLElementHidden(element)) {
          // A hidden root contributes no blocks.
          break;
        }
        if (noAncestorsAdded(node, blocks)) {
          blocks.add(node);
        }
      }
      break;
    }

    case NodeFilter.FILTER_SKIP: {
      // This node may have text to extract, but it needs to be subdivided into smaller
      // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
      // that contain enough inline elements to extract.
      processSubdivide(node, blocks);
      break;
    }
  }

  return blocks;
}

/**
 * Add qualified nodes to have their text content extracted by recursively walking
 * through the DOM tree of nodes, including elements in the Shadow DOM.
 *
 * @param {Node} node
 * @param {Set<Node>} blocks
 */
function processSubdivide(node, blocks) {
  const { ownerDocument } = node;
  if (!ownerDocument) {
    return;
  }

  // This walker will visit each node that has been subdivided enough to have its
  // text extracted. `determineBlockStatus` is the filter that decides this.
  const nodeIterator = ownerDocument.createTreeWalker(
    node,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    determineBlockStatus
  );

  let currentNode;
  while ((currentNode = nodeIterator.nextNode())) {
    const shadowRoot = getShadowRoot(currentNode);
    if (shadowRoot) {
      // Accepted shadow hosts are not added themselves, their shadow tree is walked.
      processSubdivide(shadowRoot, blocks);
    } else if (noAncestorsAdded(currentNode, blocks)) {
      blocks.add(currentNode);
    }
  }
}

/**
 * Returns true when none of the node's ancestors have already been added to the
 * set of blocks.
 *
 * TODO - The original TranslationsDocument algorithm didn't require this, so perhaps
 * something was not ported correctly. This should be removed to see if the error
 * can be reproduced, and this mitigation removed.
 *
 * @param {Node} node
 * @param {Set<Node>} blocks
 * @returns {boolean}
 */
function noAncestorsAdded(node, blocks) {
  for (const ancestor of getAncestorsIterator(node)) {
    if (blocks.has(ancestor)) {
      return false;
    }
  }
  return true;
}

/**
 * Returns an iterator of a node's ancestors, starting at its parent and stopping
 * before the document element, or when there are no more parents.
 *
 * @param {Node} node
 *
 * @yields {Node}
 */
function* getAncestorsIterator(node) {
  const document = node.ownerDocument;
  if (!document) {
    return;
  }
  for (
    let parent = node.parentNode;
    parent && parent !== document.documentElement;
    parent = parent.parentNode
  ) {
    yield parent;
  }
}

/**
 * Reads the elements computed style and determines if the element is a block-like
 * element or not. Every element that lays out like a block should be used as a unit
 * for text extraction.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function getIsBlockLike(node) {
  const element = asElement(node);
  if (!element) {
    return false;
  }

  const { ownerGlobal } = element;
  if (!ownerGlobal) {
    return false;
  }

  if (element.namespaceURI === "http://www.w3.org/2000/svg") {
    // SVG elements will report as inline, but there is no block layout in SVG.
    // Treat every SVG element as being block so that every node will be subdivided.
    return true;
  }

  /** @type {Record<string, string>} */
  // @ts-expect-error - This is a workaround for the CSSStyleDeclaration not being indexable.
  const style = ownerGlobal.getComputedStyle(element) ?? { display: null };

  // Anything that is neither inline nor undisplayed is treated as block-like.
  return style.display !== "inline" && style.display !== "none";
}

/**
 * Use TypeScript to determine if the Node is an Element.
 *
 * @param {Node | null | undefined} node
 * @returns {Element | null}
 */
function asElement(node) {
  if (node?.nodeType === Node.ELEMENT_NODE) {
    return /** @type {Element} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is a Text node.
 *
 * @param {Node | null} node
 *
 * @returns {Text | null}
 */
function asTextNode(node) {
  if (node?.nodeType === Node.TEXT_NODE) {
    return /** @type {Text} */ (node);
  }
  return null;
}

/**
 * Use TypeScript to determine if the Node is an HTMLElement.
 *
 * @param {Node | null} node
 *
 * @returns {HTMLElement | null}
 */
function asHTMLElement(node) {
  if (HTMLElement.isInstance(node)) {
    return node;
  }
  return null;
}