/* Any copyright is dedicated to the Public Domain.
https://creativecommons.org/publicdomain/zero/1.0/ */
"use strict";
/**
* @import { BrowserTestUtils } from "../../../../../testing/mochitest/BrowserTestUtils/BrowserTestUtils.sys.mjs"
* @import { PageExtractorParent } from "../../PageExtractorParent.sys.mjs"
*/
// Exercises the extractor actor with no options: plain text extraction,
// forced reader-mode extraction, and unforced reader-mode extraction.
add_task(async function test_dom_extractor_default_options() {
  const { actor, cleanup } = await html`
Hello World
This is a paragraph
`;

  // Plain extraction is expected to yield both fixture lines, newline-joined.
  const { text: extractedText } = await actor.getText();
  is(
    extractedText,
    ["Hello World", "This is a paragraph"].join("\n"),
    "Text can be extracted from the page."
  );

  // Forcing reader mode is expected to produce the same text.
  const forcedReaderContent = await actor.getReaderModeContent(
    true /* force */
  );
  is(
    forcedReaderContent.text,
    "Hello World\nThis is a paragraph",
    "Reader mode can extract page content."
  );

  // Without the force flag the expected result is the empty shape.
  const unforcedReaderContent = await actor.getReaderModeContent();
  Assert.deepEqual(
    unforcedReaderContent,
    { text: "", links: [] },
    "Empty result is returned on non-reader mode content."
  );

  return cleanup();
});
// Sweeps the `sufficientLength` option of getText() from 0 up to one past the
// full text length and checks which prefix of the page text is expected back.
add_task(async function test_dom_extractor_sufficient_length_option() {
  const { actor, cleanup } = await html`
Hello World
First paragraph.
Second paragraph.
`;

  // The three text prefixes the extractor may return, each built from the
  // previous one so the fixture strings are written only once.
  const header = "Hello World";
  const headerAndP1 = [header, "First paragraph."].join("\n");
  const allText = [headerAndP1, "Second paragraph."].join("\n");

  is(
    (await actor.getText()).text,
    allText,
    "All text is returned with the default options."
  );

  // Largest sufficientLength that still yields each prefix. These are derived
  // from the fixture rather than hard-coded (they evaluate to 12, 29 and 47).
  // NOTE(review): the "+ 1" presumably accounts for the newline that joins
  // blocks — confirm against the extractor implementation.
  const headerLimit = header.length + 1;
  const headerAndP1Limit = headerAndP1.length + 1;
  const max = allText.length + 1;

  /**
   * Maps a sufficientLength value in [0, max] to the text expected back.
   *
   * @param {number} sufficientLength
   * @returns {string}
   */
  const expectedTextFor = sufficientLength => {
    if (sufficientLength === 0) {
      return "";
    }
    if (sufficientLength <= headerLimit) {
      return header;
    }
    if (sufficientLength <= headerAndP1Limit) {
      return headerAndP1;
    }
    return allText;
  };

  for (let sufficientLength = 0; sufficientLength <= max; ++sufficientLength) {
    is(
      (await actor.getText({ sufficientLength })).text,
      expectedTextFor(sufficientLength),
      `The text, given sufficientLength of ${sufficientLength}, matches the expectation.`
    );
  }

  return cleanup();
});
// Checks that getText() returns only the fixture lines listed in
// `visibleLines`; every other line of the fixture is expected to be omitted.
add_task(
  async function test_dom_extractor_ignores_hidden_and_collapsed_nodes() {
    const { actor, cleanup } = await html`
Visible Title
Visible paragraph
Hidden via [hidden]
Hidden via display:none
Hidden via visibility:hidden
Hidden via opacity:0
Inline text within zero-sized block container
Inline text within zero-width (height>0) block container
Inline text within zero-height (width>0) block container
Block text within zero-sized inline container
Hidden container outer text
Hidden container inner text
Visible container outer text (hidden descendant)
Hidden child text in visible container
Hidden inline outer text
Hidden inline inner text
Visible inline outer text (hidden descendant)
Hidden block descendant text
Summary is visible
Hidden inside closed details
Text node directly under closed details (hidden)
`;

    // The six lines expected to survive extraction, in fixture order.
    const visibleLines = [
      "Visible Title",
      "Visible paragraph",
      "Block text within zero-sized inline container",
      "Visible container outer text (hidden descendant)",
      "Visible inline outer text (hidden descendant)",
      "Summary is visible",
    ];

    const { text: extractedText } = await actor.getText();
    is(
      extractedText,
      visibleLines.join("\n"),
      "The extractor returns only visible text."
    );

    return cleanup();
  }
);
// Checks that getText() output for this fixture is two newline-separated
// lines, the second of which carries a markdown-formatted link.
add_task(async function test_dom_extractor_inline_batching() {
  const { actor, cleanup } = await html`
This is a simple section.
`;

  const expectedLines = [
    "This is a simple section.",
    "[This entire](http://example.com/) section continues in a batch.",
  ];

  const { text: extractedText } = await actor.getText();
  is(
    extractedText,
    expectedLines.join("\n"),
    "Inline content is grouped within block elements."
  );

  return cleanup();
});
// Tests comprehensive anchor element handling per the HTML specification.
// https://html.spec.whatwg.org/multipage/links.html
// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/a
add_task(async function test_dom_extractor_link_anchors() {
  const { actor, cleanup } = await html`
Comprehensive Anchor Test
Reach us via Email.
New Tab Link
Download Attribute
Card Title
Card description.
Read bold text and italic text inside.
`;

  const { text, links } = await actor.getText();

  // Compare on trimmed, non-empty lines so that surrounding whitespace and
  // blank lines in the extracted text do not affect the assertion.
  const nonEmptyLines = [];
  for (const rawLine of text.split("\n")) {
    const trimmedLine = rawLine.trim();
    if (trimmedLine.length) {
      nonEmptyLines.push(trimmedLine);
    }
  }

  // Links are expected inline in markdown form: [label](url).
  const expectedLines = [
    "Comprehensive Anchor Test",
    "Reach us via [Email](mailto:user@mozilla.org).",
    "[New Tab Link](https://example.com/external)",
    "[Download Attribute](https://example.com/files/report.pdf)",
    "Card Title",
    "Card description.",
    "[Read bold text and italic text inside.](https://example.com/mixed)",
    "[Company Logo](https://example.com/with-alt)",
  ];
  Assert.deepEqual(
    nonEmptyLines,
    expectedLines,
    "Text output matches expected markdown format with various anchor types"
  );

  // The links array is expected to list every href, including ones
  // (card, no-alt) that do not appear as markdown links in the text above.
  const expectedLinks = [
    "mailto:user@mozilla.org",
    "https://example.com/external",
    "https://example.com/files/report.pdf",
    "https://example.com/card",
    "https://example.com/mixed",
    "https://example.com/no-alt",
    "https://example.com/with-alt",
  ];
  Assert.deepEqual(
    links,
    expectedLinks,
    "Links array contains all extracted href values"
  );

  await cleanup();
});
// Test that empty href resolves to current page URL via .href property
add_task(async function test_dom_extractor_empty_href() {
  const { actor, cleanup } = await html`
Empty Href Link
`;

  const { text, links } = await actor.getText();

  // Only the "http" prefix of the link target is checked, so the assertion
  // does not depend on the exact URL the test page is served from.
  const hasResolvedMarkdownLink = text.includes("[Empty Href Link](http");
  Assert.ok(
    hasResolvedMarkdownLink,
    `Empty href formatted as markdown with resolved URL: ${text}`
  );

  Assert.equal(links.length, 1, "One link extracted");

  const firstLink = links[0];
  Assert.ok(
    firstLink.startsWith("http"),
    `Empty href resolves to page URL: ${firstLink}`
  );

  await cleanup();
});
// Original test case from Bug 1995618 - validates the core requirement
add_task(async function test_dom_extractor_links() {
  const { actor, cleanup } = await html`
Example of Links
`;

  const extraction = await actor.getText();

  // Drop whitespace-only lines; the remaining lines are compared untrimmed.
  const nonBlankLines = extraction.text
    .split("\n")
    .filter(line => line.trim() !== "");

  const expectedLines = [
    "Example of Links",
    "Here is the [First link](https://example.com/first)",
    "Now this is an [external link](https://example.com/link)",
  ];
  Assert.deepEqual(
    nonBlankLines,
    expectedLines,
    "Text output matches expected markdown format"
  );

  const expectedLinks = [
    "https://example.com/first",
    "https://example.com/link",
  ];
  Assert.deepEqual(
    extraction.links,
    expectedLinks,
    "Links array contains extracted href values"
  );

  return cleanup();
});
// Checks that each of the four fixture lines comes back from getText() as its
// own newline-separated line.
add_task(async function test_dom_extractor_inline_block_styling() {
  const { actor, cleanup } = await html`
Bare text is sent in a batch.
Inline text at the root is sent in a batch.
Display "block"
overrides the inline designation.
`;

  const expectedLines = [
    "Bare text is sent in a batch.",
    "Inline text at the root is sent in a batch.",
    'Display "block"',
    "overrides the inline designation.",
  ];

  const { text: extractedText } = await actor.getText();
  is(
    extractedText,
    expectedLines.join("\n"),
    "Inline and block styling are extracted as separate blocks."
  );

  return cleanup();
});
// Covers link edge cases: brackets and parentheses in link text, parentheses
// in a link URL, and the ordering of the returned links array.
add_task(async function test_extractor_edge_cases() {
  const { actor, cleanup } = await html`
Link with [Brackets]
Link with (Parens)
URL with Parens
Multiline Href
Card Title
Description
`;

  const { text, links } = await actor.getText();

  // Compare on trimmed lines with empty lines removed.
  const nonEmptyLines = text
    .split("\n")
    .map(rawLine => rawLine.trim())
    .filter(trimmedLine => trimmedLine !== "");

  // In the text output, brackets and parentheses in link text are expected to
  // be backslash-escaped and parentheses in the URL percent-encoded.
  const expectedLines = [
    "[Link with \\[Brackets\\]](https://example.com/1) [Link with \\(Parens\\)](https://example.com/2) [URL with Parens](https://en.wikipedia.org/wiki/HTML_%28standard%29)",
    "[Multiline Href](https://example.com/messy)",
    "Card Title",
    "Description",
  ];
  Assert.deepEqual(
    nonEmptyLines,
    expectedLines,
    "Text output matches expected markdown format with escaped characters"
  );

  // The links array is expected to hold the URLs without that extra encoding,
  // e.g. the Wikipedia URL keeps its literal parentheses.
  const expectedLinks = [
    "https://example.com/1",
    "https://example.com/2",
    "https://en.wikipedia.org/wiki/HTML_(standard)",
    "https://example.com/messy",
    "https://example.com/card",
  ];
  Assert.deepEqual(
    links,
    expectedLinks,
    "Links array contains all extracted href values"
  );

  await cleanup();
});
// Test nested anchors - invalid HTML but browsers handle it by closing outer anchor.
// Per HTML5 spec, nested <a> tags are invalid. Browsers handle them by closing
// the outer anchor when the inner anchor is encountered:
//   <a href="outer">text1<a href="inner">text2</a></a>
// becomes effectively:
//   <a href="outer">text1</a><a href="inner">text2</a>
add_task(async function test_extractor_nested_anchors() {
  const { actor, cleanup } = await html`
`;

  const { text, links } = await actor.getText();

  // Compare on trimmed lines with empty lines removed.
  const nonEmptyLines = [];
  for (const rawLine of text.split("\n")) {
    const trimmedLine = rawLine.trim();
    if (trimmedLine.length) {
      nonEmptyLines.push(trimmedLine);
    }
  }

  // The two anchors are expected back-to-back on a single line, each in
  // markdown form, with no separator between them.
  const expectedLines = [
    "[An outer link.](https://example.com/outer-link)[An inner link.](https://example.com/inner-link)",
  ];
  Assert.deepEqual(
    nonEmptyLines,
    expectedLines,
    "Nested anchors are parsed as separate inline anchors by the browser"
  );

  // Both hrefs are expected in the links array, outer first.
  const expectedLinks = [
    "https://example.com/outer-link",
    "https://example.com/inner-link",
  ];
  Assert.deepEqual(
    links,
    expectedLinks,
    "Both links are extracted from nested anchor structure"
  );

  await cleanup();
});