// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter) // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules const strings = { // WB1, WB2 "": [], // WB3 "\r\n": ["\r\n"], // WB3a, WB3b "\n": ["\n"], "\r": ["\r"], "\v": ["\v"], "\f": ["\f"], "\x85": ["\x85"], // WB3d " ": [" "], " ": [" "], // WB4 "\xAD": ["\xAD"], "\xAD\xAD": ["\xAD\xAD"], // WB5 "a": ["a"], "ab": ["ab"], // WB6, WB7 // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112) // "a:b": ["a:b"], "a·b": ["a·b"], "a.b": ["a.b"], "a'b": ["a'b"], // WB8 "1": ["1"], "12": ["12"], // WB9 "a1": ["a1"], // WB10 "1a": ["1a"], // WB11, WB12 "1,2": ["1,2"], "1;2": ["1;2"], "1.2": ["1.2"], "1'2": ["1'2"], // WB13a "a_": ["a_"], "1_": ["1_"], "__": ["__"], // WB13b "_a": ["_a"], "_1": ["_1"], // WB999 "\0": ["\0"], "?": ["?"], "??": ["?", "?"], }; function assertSegments(string, words) { let seg = segmenter.segment(string); let segments = [...seg]; // The computed segments match the expected value. assertEqArray(segments.map(({segment}) => segment), words); // |containing()| should return the same result. for (let expected of segments) { let {segment, index} = expected; for (let i = index; i < index + segment.length; ++i) { let actual = seg.containing(i); assertDeepEq(actual, expected); } } } let segmenter = new Intl.Segmenter("en", {granularity: "word"}); for (let [string, words] of Object.entries(strings)) { assertSegments(string, words); } // WB3, WB3a, WB3b and WB4 for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) { assertSegments(string + "\xAD", [string, "\xAD"]); assertSegments("\xAD" + string, ["\xAD", string]); } // WB3d and WB4 for (let string of [" ", " "]) { assertSegments(string + "\xAD", [string + "\xAD"]); assertSegments("\xAD" + string, ["\xAD", string]); } assertSegments(" \xAD ", [" \xAD", " "]); assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]); // WB5-WB13 and WB4 for (let string of [ // WB5 "a", "ab", // WB6, WB7 // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112) // "a:b", "a·b", "a.b", "a'b", // WB8 "1", "12", // WB9 "a1", // WB10 "1a", // WB11, WB12 "1,2", "1;2", "1.2", "1'2", // WB13a "a_", "1_", "__", // WB13b "_a", "_1", // WB999 "?", ]) { assertSegments(string + "\xAD", [string + "\xAD"]); assertSegments("\xAD" + string, ["\xAD", string]); assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]); assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]); } assertSegments("?\xAD?", ["?\xAD", "?"]); for (let string of [ // WB6, WB7 "a:b", "a·b", "a.b", "a'b", // WB11, WB12 "1,2", "1;2", "1.2", "1'2", ]) { let prefix = string.slice(0, -1); let suffix = string.slice(1); assertSegments(prefix, prefix.split("")); assertSegments(suffix, suffix.split("")); } // MidNum with ALetter assertSegments("a,b", ["a", ",", "b"]); assertSegments("a;b", ["a", ";", "b"]); // MidLetter with Numeric assertSegments("1:2", ["1", ":", "2"]); assertSegments("1·2", ["1", "·", "2"]); // MidNumLet with mixed ALetter and Numeric assertSegments("a.2", ["a", ".", "2"]); assertSegments("1.b", ["1", ".", "b"]); assertSegments("a'2", ["a", "'", "2"]); assertSegments("1'b", ["1", "'", "b"]); // MidNum with ExtendNumLet assertSegments("_,_", ["_", ",", "_"]); assertSegments("_;_", ["_", ";", "_"]); // MidLetter with ExtendNumLet assertSegments("_:_", ["_", ":", "_"]); assertSegments("_·_", ["_", "·", "_"]); // MidNumLet with ExtendNumLet assertSegments("_._", ["_", ".", "_"]); assertSegments("_'_", ["_", "'", "_"]); // CLDR has locale-dependent word segmentation for the "en-posix" locale. This // locale is currently not selectable, so the Latin-1 fast-paths don't need to // implement it. If one of the two below assertions ever fail, please update // the Latin-1 fast-paths for word segmentation to implement the "en-posix" // changes. assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en"); assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en"); // Locale-dependent word segmentation. { // https://en.wikipedia.org/wiki/Colon_(punctuation)#Abbreviation_mark let string = "Word:with:colon"; let english = new Intl.Segmenter("en", {granularity: "word"}); let svenska = new Intl.Segmenter("sv", {granularity: "word"}); // Three words with two separators in English. assertEq([...english.segment(string)].length, 5); // A single word in Swedish. assertEq([...svenska.segment(string)].length, 1); } if (typeof reportCompare === "function") reportCompare(0, 0);