/* Any copyright is dedicated to the Public Domain.
   https://creativecommons.org/publicdomain/zero/1.0/ */

"use strict";

/**
 * @import { BrowserTestUtils } from "../../../../../testing/mochitest/BrowserTestUtils/BrowserTestUtils.sys.mjs"
 * @import { PageExtractorParent } from "../../PageExtractorParent.sys.mjs"
 */

// Every test below follows the same shape: the `html` tagged template (defined
// outside this file) resolves to `{ actor, cleanup }`, the assertions run
// against `actor`, and `cleanup()` is awaited in a `finally` block so that it
// still runs when an extraction call rejects part-way through a test.
//
// NOTE(review): the fixtures in this file contain only text, yet several
// expectations (markdown links, the `links` arrays, hidden-text filtering)
// cannot be produced from text alone. The markup appears to have been stripped
// from this copy of the fixtures -- confirm against the original before
// relying on these tests.

/**
 * Extracts text with the default options, and through reader mode both with
 * and without the force flag.
 */
add_task(async function test_dom_extractor_default_options() {
  const { actor, cleanup } = await html`

Hello World

This is a paragraph

`;

  try {
    is(
      (await actor.getText()).text,
      ["Hello World", "This is a paragraph"].join("\n"),
      "Text can be extracted from the page."
    );

    is(
      (await actor.getReaderModeContent(true /* force */)).text,
      "Hello World\nThis is a paragraph",
      "Reader mode can extract page content."
    );

    Assert.deepEqual(
      await actor.getReaderModeContent(),
      { text: "", links: [] },
      "Empty result is returned on non-reader mode content."
    );
  } finally {
    await cleanup();
  }
});

/**
 * Sweeps `sufficientLength` over every value from 0 to one past the full text
 * length and checks how much text is returned at each step.
 */
add_task(async function test_dom_extractor_sufficient_length_option() {
  const { actor, cleanup } = await html`

Hello World

First paragraph.

Second paragraph.

`;

  try {
    const header = "Hello World";
    const headerAndP1 = ["Hello World", "First paragraph."].join("\n");
    const allText = [
      "Hello World",
      "First paragraph.",
      "Second paragraph.",
    ].join("\n");

    is(
      (await actor.getText()).text,
      allText,
      "All text is returned with the default options."
    );

    const max = allText.length + 1;

    // Each entry pairs a predicate over `sufficientLength` with the text
    // expected for that range. The boundaries 12 and 29 equal the cumulative
    // text lengths (11 and 28) plus one for the "\n" separator -- presumably
    // how the extractor counts length; confirm against the implementation.
    const expectations = [
      [length => length === 0, ""],
      [length => length > 0 && length <= 12, header],
      [length => length > 12 && length <= 29, headerAndP1],
      [length => length > 29 && length <= max, allText],
    ];

    for (
      let sufficientLength = 0;
      sufficientLength <= max;
      ++sufficientLength
    ) {
      // The ranges are disjoint, so at most one predicate matches.
      const expectedValue = expectations.find(([predicate]) =>
        predicate(sufficientLength)
      )?.[1];

      is(
        (await actor.getText({ sufficientLength })).text,
        expectedValue,
        `The text, given sufficientLength of ${sufficientLength}, matches the expectation.`
      );
    }
  } finally {
    await cleanup();
  }
});

/**
 * Only the text labelled as visible in the fixture is expected in the output;
 * the entries labelled hidden, zero-sized, or inside closed details are not.
 */
add_task(
  async function test_dom_extractor_ignores_hidden_and_collapsed_nodes() {
    const { actor, cleanup } = await html`

Visible Title

Visible paragraph

Hidden via visibility:hidden

Hidden via opacity:0

Inline text within zero-sized block container

Inline text within zero-width (height>0) block container

Inline text within zero-height (width>0) block container

Block text within zero-sized inline container

Visible container outer text (hidden descendant)

Visible inline outer text (hidden descendant)

Summary is visible

Hidden inside closed details

Text node directly under closed details (hidden)

`;

    try {
      const expected = [
        "Visible Title",
        "Visible paragraph",
        "Block text within zero-sized inline container",
        "Visible container outer text (hidden descendant)",
        "Visible inline outer text (hidden descendant)",
        "Summary is visible",
      ].join("\n");

      is(
        (await actor.getText()).text,
        expected,
        "The extractor returns only visible text."
      );
    } finally {
      await cleanup();
    }
  }
);

/**
 * Inline content, including a link, is expected on the same output line as
 * the rest of its section.
 */
add_task(async function test_dom_extractor_inline_batching() {
  const { actor, cleanup } = await html`

This is a simple section.

This entire section continues in a batch.

`;

  try {
    is(
      (await actor.getText()).text,
      [
        "This is a simple section.",
        "[This entire](http://example.com/) section continues in a batch.",
      ].join("\n"),
      "Inline content is grouped within block elements."
    );
  } finally {
    await cleanup();
  }
});

// Tests comprehensive anchor element handling per the HTML specification.
// https://html.spec.whatwg.org/multipage/links.html
// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/a
add_task(async function test_dom_extractor_link_anchors() {
  const { actor, cleanup } = await html`

Comprehensive Anchor Test

Reach us via Email.

New Tab Link

Download Attribute

Card Title

Card description.

Read bold text and italic text inside.

`;

  try {
    const { text, links } = await actor.getText();

    // Compare trimmed, non-empty lines so that incidental whitespace in the
    // output does not affect the comparison.
    const actualLines = text
      .split("\n")
      .map(line => line.trim())
      .filter(line => line.length > 0);

    Assert.deepEqual(
      actualLines,
      [
        "Comprehensive Anchor Test",
        "Reach us via [Email](mailto:user@mozilla.org).",
        "[New Tab Link](https://example.com/external)",
        "[Download Attribute](https://example.com/files/report.pdf)",
        "Card Title",
        "Card description.",
        "[Read bold text and italic text inside.](https://example.com/mixed)",
        "[Company Logo](https://example.com/with-alt)",
      ],
      "Text output matches expected markdown format with various anchor types"
    );

    Assert.deepEqual(
      links,
      [
        "mailto:user@mozilla.org",
        "https://example.com/external",
        "https://example.com/files/report.pdf",
        "https://example.com/card",
        "https://example.com/mixed",
        "https://example.com/no-alt",
        "https://example.com/with-alt",
      ],
      "Links array contains all extracted href values"
    );
  } finally {
    await cleanup();
  }
});

// Test that empty href resolves to current page URL via .href property
add_task(async function test_dom_extractor_empty_href() {
  const { actor, cleanup } = await html`

Empty Href Link

`;

  try {
    const { text, links } = await actor.getText();

    // Only the "http" prefix is checked, so the test does not depend on the
    // exact URL the fixture page is served from.
    Assert.ok(
      text.includes("[Empty Href Link](http"),
      `Empty href formatted as markdown with resolved URL: ${text}`
    );
    Assert.equal(links.length, 1, "One link extracted");
    Assert.ok(
      links[0].startsWith("http"),
      `Empty href resolves to page URL: ${links[0]}`
    );
  } finally {
    await cleanup();
  }
});

// Original test case from Bug 1995618 - validates the core requirement
add_task(async function test_dom_extractor_links() {
  const { actor, cleanup } = await html`

Example of Links

Here is the First link
Now this is an external link

`;

  try {
    const { text, links } = await actor.getText();

    // Drop lines that are empty or whitespace-only before comparing.
    const lines = text.split("\n").filter(line => line.trim());

    Assert.deepEqual(
      lines,
      [
        "Example of Links",
        "Here is the [First link](https://example.com/first)",
        "Now this is an [external link](https://example.com/link)",
      ],
      "Text output matches expected markdown format"
    );

    Assert.deepEqual(
      links,
      ["https://example.com/first", "https://example.com/link"],
      "Links array contains extracted href values"
    );
  } finally {
    await cleanup();
  }
});

/**
 * Bare text and inline text at the root are each expected on their own line,
 * and the final sentence is expected to split where block display applies.
 */
add_task(async function test_dom_extractor_inline_block_styling() {
  const { actor, cleanup } = await html` Bare text is sent in a batch. Inline text at the root is sent in a batch.

Display "block" overrides the inline designation.

`;

  try {
    is(
      (await actor.getText()).text,
      [
        "Bare text is sent in a batch.",
        "Inline text at the root is sent in a batch.",
        'Display "block"',
        "overrides the inline designation.",
      ].join("\n"),
      "Inline and block styling are extracted as separate blocks."
    );
  } finally {
    await cleanup();
  }
});

/**
 * Link text containing brackets or parentheses is expected to be escaped in
 * the markdown output, while the `links` array carries the normalized URLs.
 */
add_task(async function test_extractor_edge_cases() {
  const { actor, cleanup } = await html`

Link with [Brackets] Link with (Parens) URL with Parens

Multiline Href

Card Title

Description

`;

  try {
    const { text, links } = await actor.getText();

    const lines = text
      .split("\n")
      .map(line => line.trim())
      .filter(line => line.length > 0);

    // Assert on the entire text output - markdown-formatted with escaped
    // special characters and block-level elements extracted as separate lines
    Assert.deepEqual(
      lines,
      [
        "[Link with \\[Brackets\\]](https://example.com/1) [Link with \\(Parens\\)](https://example.com/2) [URL with Parens](https://en.wikipedia.org/wiki/HTML_%28standard%29)",
        "[Multiline Href](https://example.com/messy)",
        "Card Title",
        "Description",
      ],
      "Text output matches expected markdown format with escaped characters"
    );

    // Assert on the entire links array - .href provides normalized absolute URLs
    Assert.deepEqual(
      links,
      [
        "https://example.com/1",
        "https://example.com/2",
        "https://en.wikipedia.org/wiki/HTML_(standard)",
        "https://example.com/messy",
        "https://example.com/card",
      ],
      "Links array contains all extracted href values"
    );
  } finally {
    await cleanup();
  }
});

// Test nested anchors - invalid HTML, but browsers handle it by closing the
// outer anchor. Per the HTML spec, nested <a> tags are invalid. Browsers
// handle them by closing the outer anchor when the inner anchor is
// encountered:
//   <a href="outer">text1 <a href="inner">text2</a></a>
// becomes effectively:
//   <a href="outer">text1 </a><a href="inner">text2</a>
add_task(async function test_extractor_nested_anchors() {
  const { actor, cleanup } = await html`

An outer link. An inner link.

`;

  try {
    const { text, links } = await actor.getText();

    const lines = text
      .split("\n")
      .map(line => line.trim())
      .filter(line => line.length > 0);

    // Browser closes outer anchor when inner anchor is encountered, so we get
    // two separate anchors. Since both are inline elements within the same
    // block (div), they are extracted together on the same line.
    Assert.deepEqual(
      lines,
      [
        "[An outer link.](https://example.com/outer-link)[An inner link.](https://example.com/inner-link)",
      ],
      "Nested anchors are parsed as separate inline anchors by the browser"
    );

    Assert.deepEqual(
      links,
      ["https://example.com/outer-link", "https://example.com/inner-link"],
      "Both links are extracted from nested anchor structure"
    );
  } finally {
    await cleanup();
  }
});