// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). // Various collation-related algorithms and constants in this file are // adapted from ICU4C and, therefore, are subject to the ICU license as // described in LICENSE. //! This module holds the `Collator` struct whose `compare_impl()` contains //! the comparison of collation element sequences. use alloc::collections::VecDeque; use alloc::vec::Vec; use crate::elements::CharacterAndClassAndTrieValue; use crate::elements::CollationElement32; use crate::elements::Tag; use crate::elements::BACKWARD_COMBINING_MARKER; use crate::elements::CE_BUFFER_SIZE; use crate::elements::FALLBACK_CE32; use crate::elements::HANGUL_N_COUNT; use crate::elements::HANGUL_S_BASE; use crate::elements::HANGUL_S_COUNT; use crate::elements::HANGUL_T_COUNT; use crate::elements::IDENTICAL_PREFIX_HANGUL_MARKER_CE32; use crate::elements::NON_ROUND_TRIP_MARKER; use crate::elements::{ char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32, HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY, NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK, }; use crate::options::CollatorOptionsBitField; use crate::options::{ AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength, }; use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType}; use crate::provider::CollationData; use crate::provider::CollationDiacritics; use crate::provider::CollationDiacriticsV1; use crate::provider::CollationJamo; use crate::provider::CollationJamoV1; use crate::provider::CollationMetadataV1; use crate::provider::CollationReordering; use crate::provider::CollationReorderingV1; use crate::provider::CollationRootV1; use crate::provider::CollationSpecialPrimariesV1; use 
crate::provider::CollationSpecialPrimariesValidated; use crate::provider::CollationTailoringV1; use core::cmp::Ordering; use core::convert::{Infallible, TryFrom}; use icu_collections::codepointtrie::AbstractCodePointTrie; use icu_collections::codepointtrie::CharsWithTrieDefaultForAsciiEx; use icu_collections::codepointtrie::CharsWithTrieEx; #[cfg(feature = "serde")] use icu_collections::codepointtrie::CodePointTrie; #[cfg(not(feature = "serde"))] use icu_collections::codepointtrie::FastCodePointTrie; #[cfg(feature = "latin1")] use icu_collections::codepointtrie::Latin1CharsWithTrieEx; use icu_collections::codepointtrie::TypedCodePointTrie; use icu_collections::codepointtrie::WithTrie; use icu_normalizer::provider::DecompositionData; use icu_normalizer::provider::DecompositionTables; use icu_normalizer::provider::NormalizerNfdDataV1; use icu_normalizer::provider::NormalizerNfdTablesV1; use icu_provider::marker::ErasedMarker; use icu_provider::prelude::*; use smallvec::SmallVec; use utf16_iter::Utf16CharsWithTrieEx; use utf8_iter::Utf8CharsWithTrieDefaultForAsciiEx; use utf8_iter::Utf8CharsWithTrieEx; use zerovec::ule::AsULE; #[cfg(feature = "serde")] type NormTrie<'trie> = CodePointTrie<'trie, u32>; #[cfg(not(feature = "serde"))] type NormTrie<'trie> = FastCodePointTrie<'trie, u32>; // Special sort key bytes for all levels. const LEVEL_SEPARATOR_BYTE: u8 = 1; /// Merge-sort-key separator. /// /// Same as the unique primary and identical-level weights of U+FFFE. Must not /// be used as primary compression low terminator. Otherwise usable. const MERGE_SEPARATOR: char = '\u{fffe}'; const MERGE_SEPARATOR_BYTE: u8 = 2; const MERGE_SEPARATOR_PRIMARY: u32 = 0x02000000; /// Primary compression low terminator, must be greater than [`MERGE_SEPARATOR_BYTE`]. /// /// Reserved value in primary second byte if the lead byte is compressible. /// Otherwise usable in all CE weight bytes. const PRIMARY_COMPRESSION_LOW_BYTE: u8 = 3; /// Primary compression high terminator. 
///
/// Reserved value in primary second byte if the lead byte is compressible.
/// Otherwise usable in all CE weight bytes.
const PRIMARY_COMPRESSION_HIGH_BYTE: u8 = 0xff;

/// Default secondary/tertiary weight lead byte.
const COMMON_BYTE: u8 = 5;
// `COMMON_BYTE` (0x05) in the lead (high) byte of a 16-bit weight.
const COMMON_WEIGHT16: u16 = 0x0500;

// Internal flags for sort key generation; one bit per sort key level.
const PRIMARY_LEVEL_FLAG: u8 = 0x01;
const SECONDARY_LEVEL_FLAG: u8 = 0x02;
const CASE_LEVEL_FLAG: u8 = 0x04;
const TERTIARY_LEVEL_FLAG: u8 = 0x08;
const QUATERNARY_LEVEL_FLAG: u8 = 0x10;

// Level-flag masks indexed by `Strength` discriminant (array length is
// `Strength::Identical as usize + 1`). NOTE(review): the three zero
// entries presumably correspond to unused discriminant values between
// `Strength::Quaternary` and `Strength::Identical` — confirm against
// the `Strength` declaration. The case-level flag is not part of these
// masks and is handled separately.
const LEVEL_MASKS: [u8; Strength::Identical as usize + 1] = [
    PRIMARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
    0,
    0,
    0,
    PRIMARY_LEVEL_FLAG | SECONDARY_LEVEL_FLAG | TERTIARY_LEVEL_FLAG | QUATERNARY_LEVEL_FLAG,
];

// Internal constants for indexing into the below compression
// configurations. Each `[u8; 4]` table holds the low, middle, and high
// boundary bytes of a common-weight run followed by the maximum run
// count (see the per-table comments below, e.g. 0x21 == 33).
const WEIGHT_LOW: usize = 0;
const WEIGHT_MIDDLE: usize = 1;
const WEIGHT_HIGH: usize = 2;
const WEIGHT_MAX_COUNT: usize = 3;

// Secondary level: Compress up to 33 common weights as 05..25 or 25..45.
const SEC_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];

// Case level, lowerFirst: Compress up to 7 common weights as 1..7 or 7..13.
const CASE_LOWER_FIRST_COMMON: [u8; 4] = [1, 7, 13, 7];
// Case level, upperFirst: Compress up to 13 common weights as 3..15.
const CASE_UPPER_FIRST_COMMON: [u8; 4] = [3, 0 /* unused */, 15, 13];

// Tertiary level only (no case): Compress up to 97 common weights as 05..65 or 65..C5.
const TER_ONLY_COMMON: [u8; 4] = [COMMON_BYTE, COMMON_BYTE + 0x60, COMMON_BYTE + 0xc0, 0x61];
// Tertiary with case, lowerFirst: Compress up to 33 common weights as 05..25 or 25..45.
const TER_LOWER_FIRST_COMMON: [u8; 4] =
    [COMMON_BYTE, COMMON_BYTE + 0x20, COMMON_BYTE + 0x40, 0x21];
// Tertiary with case, upperFirst: Compress up to 33 common weights as 85..A5 or A5..C5.
const TER_UPPER_FIRST_COMMON: [u8; 4] = [
    COMMON_BYTE + 0x80,
    COMMON_BYTE + 0x80 + 0x20,
    COMMON_BYTE + 0x80 + 0x40,
    0x21,
];

// Quaternary level: low 0x1C, middle 0x8C, high 0xFC, max run count 0x71.
const QUAT_COMMON: [u8; 4] = [0x1c, 0x1c + 0x70, 0x1c + 0xe0, 0x71];
// One below the quaternary common-weight low byte.
const QUAT_SHIFTED_LIMIT_BYTE: u8 = QUAT_COMMON[WEIGHT_LOW] - 1; // 0x1b

// Constants for the "slope" difference encoding.
// NOTE(review): these appear to implement a BOCU-style delta encoding
// (single-, double-, and triple-byte differences around a middle
// value) — confirm against the ICU4C counterpart before relying on
// this description.
// Do not use byte values 0, 1, 2 because they are separators in sort keys.
const SLOPE_MIN: i32 = 3;
const SLOPE_MAX: i32 = 0xff;
const SLOPE_MIDDLE: i32 = 0x81;
// Number of distinct trail byte values (SLOPE_MIN..=SLOPE_MAX).
const SLOPE_TAIL_COUNT: i32 = SLOPE_MAX - SLOPE_MIN + 1;
// How many lead values are dedicated to single-, double-, and
// triple-byte encodings, respectively.
const SLOPE_SINGLE: i32 = 80;
const SLOPE_LEAD_2: i32 = 42;
const SLOPE_LEAD_3: i32 = 3;
// The difference value range for single-byters.
const SLOPE_REACH_POS_1: i32 = SLOPE_SINGLE;
const SLOPE_REACH_NEG_1: i32 = -SLOPE_SINGLE;
// The difference value range for double-byters.
const SLOPE_REACH_POS_2: i32 = SLOPE_LEAD_2 * SLOPE_TAIL_COUNT + (SLOPE_LEAD_2 - 1);
const SLOPE_REACH_NEG_2: i32 = -SLOPE_REACH_POS_2 - 1;
// The difference value range for 3-byters.
const SLOPE_REACH_POS_3: i32 = SLOPE_LEAD_3 * SLOPE_TAIL_COUNT * SLOPE_TAIL_COUNT
    + (SLOPE_LEAD_3 - 1) * SLOPE_TAIL_COUNT
    + (SLOPE_TAIL_COUNT - 1);
const SLOPE_REACH_NEG_3: i32 = -SLOPE_REACH_POS_3 - 1;
// The lead byte start values.
const SLOPE_START_POS_2: i32 = SLOPE_MIDDLE + SLOPE_SINGLE + 1;
const SLOPE_START_POS_3: i32 = SLOPE_START_POS_2 + SLOPE_LEAD_2;
const SLOPE_START_NEG_2: i32 = SLOPE_MIDDLE + SLOPE_REACH_NEG_1;
const SLOPE_START_NEG_3: i32 = SLOPE_START_NEG_2 - SLOPE_LEAD_2;

/// Accumulates the bitwise OR of the `NonPrimary` bits of collation
/// elements so that the presence of any quaternary-level bits can be
/// queried once at the end instead of per element.
struct AnyQuaternaryAccumulator(u32);

impl AnyQuaternaryAccumulator {
    /// Creates an empty (all-zero) accumulator.
    #[inline(always)]
    pub fn new() -> Self {
        AnyQuaternaryAccumulator(0)
    }

    /// ORs the bits of `non_primary` into the accumulator.
    #[inline(always)]
    pub fn accumulate(&mut self, non_primary: NonPrimary) {
        self.0 |= non_primary.bits()
    }

    /// `true` iff any accumulated element had bits set within
    /// `QUATERNARY_MASK`.
    #[inline(always)]
    pub fn has_quaternary(&self) -> bool {
        self.0 & u32::from(QUATERNARY_MASK) != 0
    }
}

/// `true` iff `i` is greater or equal to `start` and less or equal
/// to `end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    // Single-comparison range check: when `i < start`, the wrapping
    // subtraction yields a value near `u16::MAX`, which cannot fit
    // within `end - start`.
    let offset = i.wrapping_sub(start);
    offset <= (end - start)
}

/// Finds the identical prefix of `left` and `right` containing
/// Latin1.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
    // Length of the longest byte-identical prefix; falls back to the
    // shorter slice length when one input is a prefix of the other.
    let prefix_len = left
        .iter()
        .zip(right)
        .position(|(l, r)| l != r)
        .unwrap_or_else(|| left.len().min(right.len()));
    // `prefix_len` is always in range for both slices, but stay on the
    // checked APIs to keep this trivially panic-free.
    match (left.split_at_checked(prefix_len), right.get(prefix_len..)) {
        (Some((head, left_tail)), Some(right_tail)) => (head, left_tail, right_tail),
        _ => (&[], left, right),
    }
}

/// Finds the identical prefix of `left` containing Latin1
/// and `right` containing potentially ill-formed UTF-16.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
fn split_prefix_latin1_utf16<'a, 'b>(
    left: &'a [u8],
    right: &'b [u16],
) -> (&'a [u8], &'a [u8], &'b [u16]) {
    // Length of the longest prefix on which the Latin1 byte, widened
    // to a UTF-16 code unit, matches the UTF-16 code unit.
    let prefix_len = left
        .iter()
        .zip(right)
        .position(|(l, r)| u16::from(*l) != *r)
        .unwrap_or_else(|| left.len().min(right.len()));
    match (left.split_at_checked(prefix_len), right.get(prefix_len..)) {
        (Some((head, left_tail)), Some(right_tail)) => (head, left_tail, right_tail),
        _ => (&[], left, right),
    }
}

/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-16, while avoiding splitting a
/// well-formed surrogate pair. In case of ill-formed
/// UTF-16, the prefix is not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix_u16<'a, 'b>(
    left: &'a [u16],
    right: &'b [u16],
) -> (&'a [u16], &'a [u16], &'b [u16]) {
    // Length of the longest code-unit-identical prefix; falls back to
    // the shorter slice length when one input is a prefix of the other.
    let mut prefix_len = left
        .iter()
        .zip(right)
        .position(|(l, r)| l != r)
        .unwrap_or_else(|| left.len().min(right.len()));
    if prefix_len > 0 {
        if let Some(&last) = left.get(prefix_len - 1) {
            // Don't let the prefix end on a lead surrogate: it might
            // form a well-formed pair with the next code unit, which
            // would then be split across prefix and tail.
            if in_inclusive_range16(last, 0xD800, 0xDBFF) {
                prefix_len -= 1;
            }
            // `prefix_len` is in range for both slices, but stay on the
            // checked APIs to keep this trivially panic-free.
            if let Some((head, left_tail)) = left.split_at_checked(prefix_len) {
                if let Some(right_tail) = right.get(prefix_len..) {
                    return (head, left_tail, right_tail);
                }
            }
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` and `right` containing
/// potentially ill-formed UTF-8, while avoiding splitting a UTF-8
/// byte sequence. In case of ill-formed UTF-8, the prefix is
/// not guaranteed to be maximal.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix_u8<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
    // A byte of the form 0b10xx_xxxx is a UTF-8 continuation byte.
    const CONTINUATION_MASK: u8 = 0b1100_0000;
    const CONTINUATION: u8 = 0b1000_0000;
    // Length of the longest byte-identical prefix; falls back to the
    // shorter slice length when one input is a prefix of the other.
    let mut prefix_len = left
        .iter()
        .zip(right)
        .position(|(l, r)| l != r)
        .unwrap_or_else(|| left.len().min(right.len()));
    if prefix_len != 0 {
        // The tails must not start with a UTF-8 continuation byte
        // unless it's the first byte of the original slice.
        // At the first differing index the two inputs disagree, but all
        // earlier bytes are shared, so the first step only needs to
        // look at one of the two sides.
        if let Some(right_first) = right.get(prefix_len) {
            if (right_first & CONTINUATION_MASK) == CONTINUATION {
                prefix_len -= 1;
            }
        }
        // Walk backwards over any remaining continuation bytes (now
        // within the shared prefix, so either side works).
        while prefix_len != 0 {
            match left.get(prefix_len) {
                Some(b) if (b & CONTINUATION_MASK) == CONTINUATION => prefix_len -= 1,
                _ => break,
            }
        }
        // `prefix_len` is in range for both slices, but stay on the
        // checked APIs to keep this trivially panic-free.
        if let Some((head, left_tail)) = left.split_at_checked(prefix_len) {
            if let Some(right_tail) = right.get(prefix_len..) {
                return (head, left_tail, right_tail);
            }
        }
    }
    (&[], left, right)
}

/// Finds the identical prefix of `left` and `right` containing
/// guaranteed well-formed UTF-8.
///
/// Returns the identical prefix, the part of `left` after the
/// prefix, and the part of `right` after the prefix.
fn split_prefix<'a, 'b>(left: &'a str, right: &'b str) -> (&'a str, &'a str, &'b str) { let left_bytes = left.as_bytes(); let right_bytes = right.as_bytes(); let mut i = left_bytes .iter() .zip(right_bytes.iter()) .take_while(|(l, r)| l == r) .count(); if i != 0 { // Tails must not start with a UTF-8 continuation // byte. // Since the inputs are valid UTF-8, the first byte // of either input slice cannot be a contination slice, // so we may rely on finding a lead byte when walking // backwards. // Since the inputs are valid UTF-8, if a tail starts // with a continuation, both tails must start with a // continuation, since the most recent lead byte must // be equal, so the difference is within valid UTF-8 // sequences of equal length. // Therefore, it's sufficient to examine only one of // the sides. loop { if let Some(left_first) = left_bytes.get(i) { if (left_first & 0b1100_0000) == 0b1000_0000 { i -= 1; continue; } } break; } // The methods below perform useless UTF-8 boundary checks, // since we just checked. However, avoiding `unsafe` to // make this code easier to audit. if let Some((head, left_tail)) = left.split_at_checked(i) { if let Some(right_tail) = right.get(i..) { return (head, left_tail, right_tail); } } } ("", left, right) } /// Holder struct for payloads that are locale-dependent. (For code /// reuse between owned and borrowed cases.) #[derive(Debug)] struct LocaleSpecificDataHolder { tailoring: Option>, diacritics: DataPayload, reordering: Option>, merged_options: CollatorOptionsBitField, lithuanian_dot_above: bool, } icu_locale_core::preferences::define_preferences!( /// The preferences for collation. /// /// # Preferences /// /// Examples for using the different preferences below can be found in the [crate-level docs](crate). /// /// ## Case First /// /// See the [spec](https://www.unicode.org/reports/tr35/tr35-collation.html#Case_Parameters). /// This is the BCP47 key `kf`. 
Three possibilities: [`CollationCaseFirst::False`] (default, /// except for Danish and Maltese), [`CollationCaseFirst::Lower`], and [`CollationCaseFirst::Upper`] /// (default for Danish and Maltese). /// /// ## Numeric /// /// This is the BCP47 key `kn`. When set to [`CollationNumericOrdering::True`], any sequence of decimal /// digits (General_Category = Nd) is sorted at the primary level according to the /// numeric value. The default is [`CollationNumericOrdering::False`]. [Copy] CollatorPreferences, { /// The collation type. This corresponds to the `-u-co` BCP-47 tag. collation_type: CollationType, /// Treatment of case. (Large and small kana differences are treated as case differences.) /// This corresponds to the `-u-kf` BCP-47 tag. case_first: CollationCaseFirst, /// When set to `True`, any sequence of decimal digits is sorted at a primary level according /// to the numeric value. /// This corresponds to the `-u-kn` BPC-47 tag. numeric_ordering: CollationNumericOrdering } ); impl LocaleSpecificDataHolder { /// The constructor code reused between owned and borrowed cases. 
fn try_new_unstable_internal( provider: &D, prefs: CollatorPreferences, options: CollatorOptions, ) -> Result where D: DataProvider + DataProvider + DataProvider + DataProvider + ?Sized, { let marker_attributes = prefs .collation_type .as_ref() // all collation types are valid marker attributes .map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str())) .unwrap_or_default(); let data_locale = CollationTailoringV1::make_locale(prefs.locale_preferences); let req = DataRequest { id: DataIdentifierBorrowed::for_marker_attributes_and_locale( marker_attributes, &data_locale, ), metadata: { let mut metadata = DataRequestMetadata::default(); metadata.silent = true; metadata }, }; let fallback_req = DataRequest { id: DataIdentifierBorrowed::for_marker_attributes_and_locale( Default::default(), &data_locale, ), ..Default::default() }; let metadata_payload: DataPayload = provider .load(req) .or_else(|_| provider.load(fallback_req))? .payload; let metadata = metadata_payload.get(); let tailoring: Option> = if metadata.tailored() { Some( provider .load(req) .or_else(|_| provider.load(fallback_req))? .payload, ) } else { None }; let reordering: Option> = if metadata.reordering() { Some( provider .load(req) .or_else(|_| provider.load(fallback_req))? .payload, ) } else { None }; if let Some(reordering) = &reordering { if reordering.get().reorder_table.len() != 256 { return Err(DataError::custom("invalid").with_marker(CollationReorderingV1::INFO)); } } let tailored_diacritics = metadata.tailored_diacritics(); let diacritics: DataPayload = provider .load(if tailored_diacritics { req } else { Default::default() })? .payload; if tailored_diacritics { // In the tailored case we accept a shorter table in which case the tailoring is // responsible for supplying the missing values in the trie. // As of June 2022, none of the collations actually use a shortened table. // Vietnamese and Ewe load a full-length alternative table and the rest use // the default one. 
if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT { return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO)); } } else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT { return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO)); } let mut altered_defaults = CollatorOptionsBitField::default(); if metadata.alternate_shifted() { altered_defaults.set_alternate_handling(Some(AlternateHandling::Shifted)); } if metadata.backward_second_level() { altered_defaults.set_backward_second_level(Some(true)); } altered_defaults.set_case_first(Some(metadata.case_first())); altered_defaults.set_max_variable(Some(metadata.max_variable())); let mut merged_options = CollatorOptionsBitField::from(options); merged_options.set_case_first(prefs.case_first); merged_options.set_numeric_from_enum(prefs.numeric_ordering); merged_options.set_defaults(altered_defaults); Ok(LocaleSpecificDataHolder { tailoring, diacritics, merged_options, reordering, lithuanian_dot_above: metadata.lithuanian_dot_above(), }) } } /// Compares strings according to culturally-relevant ordering. #[derive(Debug)] pub struct Collator { special_primaries: DataPayload>>, root: DataPayload, tailoring: Option>, jamo: DataPayload, diacritics: DataPayload, options: CollatorOptionsBitField, reordering: Option>, decompositions: DataPayload, tables: DataPayload, lithuanian_dot_above: bool, } impl Collator { /// Constructs a borrowed version of this type for more efficient querying. 
pub fn as_borrowed(&self) -> CollatorBorrowed<'_> { CollatorBorrowed { special_primaries: self.special_primaries.get(), root: self.root.get(), tailoring: if let Some(t) = self.tailoring.as_ref() { t.get() } else { self.root.get() }, jamo: self.jamo.get(), diacritics: self.diacritics.get(), options: self.options, reordering: self.reordering.as_ref().map(|s| s.get()), decompositions: self.decompositions.get(), tables: self.tables.get(), lithuanian_dot_above: self.lithuanian_dot_above, } } /// Creates `CollatorBorrowed` for the given locale and options from compiled data. #[cfg(feature = "compiled_data")] pub fn try_new( prefs: CollatorPreferences, options: CollatorOptions, ) -> Result, DataError> { CollatorBorrowed::try_new(prefs, options) } icu_provider::gen_buffer_data_constructors!( (prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError, functions: [ try_new: skip, try_new_with_buffer_provider, try_new_unstable, Self ] ); #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)] pub fn try_new_unstable( provider: &D, prefs: CollatorPreferences, options: CollatorOptions, ) -> Result where D: DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + ?Sized, { Self::try_new_unstable_internal( provider, provider.load(Default::default())?.payload, provider.load(Default::default())?.payload, provider.load(Default::default())?.payload, provider.load(Default::default())?.payload, provider.load(Default::default())?.payload, prefs, options, ) } #[expect(clippy::too_many_arguments)] fn try_new_unstable_internal( provider: &D, root: DataPayload, decompositions: DataPayload, tables: DataPayload, jamo: DataPayload, special_primaries: DataPayload, prefs: CollatorPreferences, options: CollatorOptions, ) -> Result where D: DataProvider + DataProvider + DataProvider + DataProvider + DataProvider + ?Sized, { let locale_dependent = 
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; // TODO: redesign Korean search collation handling if jamo.get().ce32s.len() != JAMO_COUNT { return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO)); } // `variant_count` isn't stable yet: // https://github.com/rust-lang/rust/issues/73662 if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) { return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)); } let special_primaries = special_primaries.map_project(|csp, _| { let compressible_bytes = (csp.last_primaries.len() == MaxVariable::Currency as usize + 16) .then(|| { csp.last_primaries .as_maybe_borrowed()? .as_ule_slice() .get((MaxVariable::Currency as usize)..)? .try_into() .ok() }) .flatten() .unwrap_or( CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK, ); CollationSpecialPrimariesValidated { last_primaries: csp.last_primaries.truncated(MaxVariable::Currency as usize), numeric_primary: csp.numeric_primary, compressible_bytes, } }); Ok(Collator { special_primaries, root, tailoring: locale_dependent.tailoring, jamo, diacritics: locale_dependent.diacritics, options: locale_dependent.merged_options, reordering: locale_dependent.reordering, decompositions, tables, lithuanian_dot_above: locale_dependent.lithuanian_dot_above, }) } } macro_rules! 
quick_primary_compare { ($left_primary:ident, $right_primary:ident, $variable_top:ident, $self:ident, ) => { if ($left_primary != $right_primary) && ($left_primary != 0) && ($right_primary != 0) && !($left_primary < $variable_top && $left_primary > MERGE_SEPARATOR_PRIMARY) && !($right_primary < $variable_top && $right_primary > MERGE_SEPARATOR_PRIMARY) { if let Some(reordering) = &$self.reordering { $left_primary = reordering.reorder($left_primary); $right_primary = reordering.reorder($right_primary); } if $left_primary < $right_primary { return Ordering::Less; } return Ordering::Greater; } }; } macro_rules! hangul_syllable_compare { ($left_hangul_offset:ident, $right_hangul_offset:ident, ) => { let left_l = $left_hangul_offset / HANGUL_N_COUNT; let right_l = $right_hangul_offset / HANGUL_N_COUNT; if left_l != right_l { // We don't really support jamo tailoring, so let's // compare the jamo directly. if left_l < right_l { return Ordering::Less; } return Ordering::Greater; } let left_v = ($left_hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT; let right_v = ($right_hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT; if left_v != right_v { // We don't really support jamo tailoring, so let's // compare the jamo directly. if left_v < right_v { return Ordering::Less; } return Ordering::Greater; } let left_t = $left_hangul_offset % HANGUL_T_COUNT; let right_t = $right_hangul_offset % HANGUL_T_COUNT; // If either syllable is a two-jamo syllable, we fall through. if left_t != right_t && left_t != 0 && right_t != 0 { // We don't really support jamo tailoring, so let's // compare the jamo directly. if left_t < right_t { return Ordering::Less; } return Ordering::Greater; } }; } macro_rules! 
compare { ($(#[$meta:meta])*, $compare:ident, $left_slice:ty, $right_slice:ty, $split_prefix:ident, $left_to_iter:ident, $right_to_iter:ident, $self:ident, $left_tail:ident, $right_tail:ident, $primary_check:block, $variable_top:ident, ) => { $(#[$meta])* pub fn $compare(&$self, left: &$left_slice, right: &$right_slice) -> Ordering { let (head, $left_tail, $right_tail) = $split_prefix(left, right); if $left_tail.is_empty() && $right_tail.is_empty() { return Ordering::Equal; } let $variable_top = $self.variable_top(); if head.is_empty() { $primary_check } let norm_trie = $self.norm_trie(); let ret = $self.compare_impl($left_tail.$left_to_iter(norm_trie), $right_tail.$right_to_iter(norm_trie), head.$left_to_iter(norm_trie), $variable_top); if $self.options.strength() == Strength::Identical && ret == Ordering::Equal { // We don't need to remove the leading U+0000, because it compares equal anyway. return icu_normalizer::new_decomposition($left_tail.$left_to_iter(norm_trie), $self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }).cmp( icu_normalizer::new_decomposition($right_tail.$right_to_iter(norm_trie), $self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }), ); } ret } } } /// Compares strings according to culturally-relevant ordering, /// borrowed version. #[derive(Debug)] pub struct CollatorBorrowed<'a> { special_primaries: &'a CollationSpecialPrimariesValidated<'a>, root: &'a CollationData<'a>, tailoring: &'a CollationData<'a>, jamo: &'a CollationJamo<'a>, diacritics: &'a CollationDiacritics<'a>, options: CollatorOptionsBitField, reordering: Option<&'a CollationReordering<'a>>, decompositions: &'a DecompositionData<'a>, tables: &'a DecompositionTables<'a>, lithuanian_dot_above: bool, } impl CollatorBorrowed<'static> { /// Creates a collator for the given locale and options from compiled data. 
#[cfg(feature = "compiled_data")] pub fn try_new( prefs: CollatorPreferences, options: CollatorOptions, ) -> Result { // These are assigned to locals in order to keep the code after these assignments // copypaste-compatible with `Collator::try_new_unstable_internal`. let provider = &crate::provider::Baked; let decompositions = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1; let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1; let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1; let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1; let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; // TODO: redesign Korean search collation handling const _: () = assert!( crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1 .ce32s .as_slice() .len() == JAMO_COUNT ); // `variant_count` isn't stable yet: // https://github.com/rust-lang/rust/issues/73662 const _: () = assert!( crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .last_primaries .as_slice() .len() > (MaxVariable::Currency as usize) ); let special_primaries = const { &CollationSpecialPrimariesValidated { last_primaries: zerovec::ZeroSlice::from_ule_slice( crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .last_primaries .as_slice() .as_ule_slice() .split_at(MaxVariable::Currency as usize) .0, ) .as_zerovec(), numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .numeric_primary, compressible_bytes: { const C: &[::ULE] = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .last_primaries .as_slice() .as_ule_slice(); if C.len() == MaxVariable::Currency as usize + 16 { let i = MaxVariable::Currency as usize; #[allow(clippy::indexing_slicing)] // protected, const &[ C[i], C[i + 1], C[i + 2], C[i + 3], C[i + 4], C[i + 5], C[i + 6], C[i + 7], C[i + 8], C[i + 9], C[i + 10], C[i + 11], C[i + 12], C[i + 13], C[i + 14], C[i + 15], ] } else { 
CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK } }, } }; // Attribute belongs closer to `unwrap`, but // https://github.com/rust-lang/rust/issues/15701 #[expect(clippy::unwrap_used)] Ok(CollatorBorrowed { special_primaries, root, // Unwrap is OK, because we know we have the baked provider. tailoring: if let Some(s) = locale_dependent.tailoring { s.get_static().unwrap() } else { root }, jamo, // Unwrap is OK, because we know we have the baked provider. diacritics: locale_dependent.diacritics.get_static().unwrap(), options: locale_dependent.merged_options, // Unwrap is OK, because we know we have the baked provider. reordering: locale_dependent.reordering.map(|s| s.get_static().unwrap()), decompositions, tables, lithuanian_dot_above: locale_dependent.lithuanian_dot_above, }) } /// Cheaply converts a [`CollatorBorrowed<'static>`] into a [`Collator`]. /// /// Note: Due to branching and indirection, using [`Collator`] might inhibit some /// compile-time optimizations that are possible with [`CollatorBorrowed`]. pub const fn static_to_owned(self) -> Collator { Collator { special_primaries: DataPayload::from_static_ref(self.special_primaries), root: DataPayload::from_static_ref(self.root), tailoring: Some(DataPayload::from_static_ref(self.tailoring)), jamo: DataPayload::from_static_ref(self.jamo), diacritics: DataPayload::from_static_ref(self.diacritics), options: self.options, reordering: if let Some(s) = self.reordering { // `map` not available in const context Some(DataPayload::from_static_ref(s)) } else { None }, decompositions: DataPayload::from_static_ref(self.decompositions), tables: DataPayload::from_static_ref(self.tables), lithuanian_dot_above: self.lithuanian_dot_above, } } } macro_rules! 
// Tail of the `collation_elements!` helper macro (its `macro_rules!` header lies
// above this chunk). It assembles a `CollationElements` iterator from a character
// iterator plus the collator's root and tailoring data, the jamo ce32 table,
// diacritic secondaries, and the normalization tables.
// NOTE(review): `<&[::ULE; JAMO_COUNT]>` appears to have lost a generic argument
// in transit (presumably `<&[<u32 as AsULE>::ULE; JAMO_COUNT]>`); confirm
// against upstream before compiling.
collation_elements { ($self:expr, $chars:expr, $numeric_primary:expr) => {{
    let jamo = <&[::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice());
    // The jamo table length is a data invariant; a mismatch means broken data.
    let jamo = jamo.unwrap();
    CollationElements::new(
        $chars,
        $self.root,
        $self.tailoring,
        jamo,
        &$self.diacritics.secondaries,
        $self.tables,
        $numeric_primary,
        $self.lithuanian_dot_above,
    )
}}; }

impl<'data> CollatorBorrowed<'data> {
    /// The resolved options showing how the default options, the requested options,
    /// and the options from locale data were combined.
    pub fn resolved_options(&self) -> ResolvedCollatorOptions {
        self.options.into()
    }

    // Borrows the NFD trie in whichever `NormTrie` representation the `serde`
    // feature selected (see the alias near the top of the file). The conversion
    // cannot fail for compatible data, hence the `unreachable!`.
    fn norm_trie(&self) -> &'data NormTrie<'data> {
        #[allow(clippy::useless_conversion)]
        <&NormTrie<'data>>::try_from(&self.decompositions.trie)
            .unwrap_or_else(|_| unreachable!("Incompatible data"))
    }

    compare!(
        /// Compare guaranteed well-formed UTF-8 slices.
        ,
        compare,
        str,
        str,
        split_prefix,
        chars_with_trie_default_for_ascii,
        chars_with_trie_default_for_ascii,
        self,
        left_tail,
        right_tail,
        {
            // Not macroized: Copy and paste to the other UTF-8 case below.
            let tailoring_trie = &self.tailoring.trie;
            if let Some((left_c, left_u32)) = left_tail.chars_with_trie(tailoring_trie).next() {
                // Logically, there's a bunch of stuff we could do from the left side alone to determine
                // that reading from the right side at all or reading from the right trie is useless.
                // Doing that seems to be a pessimization, at least in the absence of PGO.
                if let Some((right_c, right_u32)) = right_tail.chars_with_trie(tailoring_trie).next() {
                    let left_ce32 = CollationElement32::new(left_u32);
                    let right_ce32 = CollationElement32::new(right_u32);
                    if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
                        if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
                            quick_primary_compare!(left_primary, right_primary, variable_top, self,);
                        }
                    }
                    // Try Hangul. (At least in the absence of PGO, it's better _not_ to put
                    // this into an `else` branch of the `left_primary` `if`.)
                    let right_hangul_offset = u32::from(right_c).wrapping_sub(HANGUL_S_BASE);
                    if right_hangul_offset < HANGUL_S_COUNT {
                        let left_hangul_offset = u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
                        if left_hangul_offset < HANGUL_S_COUNT {
                            hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
                        }
                    }
                }
            }
            // Note: It might look like a good idea to cache the CE32s, but
            // doing so actually makes things slower.
        },
        variable_top,
    );

    compare!(
        /// Compare potentially ill-formed UTF-8 slices. Ill-formed input is compared
        /// as if errors had been replaced with REPLACEMENT CHARACTERs according
        /// to the WHATWG Encoding Standard.
        ,
        compare_utf8,
        [u8],
        [u8],
        split_prefix_u8,
        chars_with_trie_default_for_ascii,
        chars_with_trie_default_for_ascii,
        self,
        left_tail,
        right_tail,
        {
            // Direct copypaste from the `str` case.
            let tailoring_trie = &self.tailoring.trie;
            if let Some((left_c, left_u32)) = left_tail.chars_with_trie(tailoring_trie).next() {
                // Logically, there's a bunch of stuff we could do from the left side alone to determine
                // that reading from the right side at all or reading from the right trie is useless.
                // Doing that seems to be a pessimization, at least in the absence of PGO.
                if let Some((right_c, right_u32)) = right_tail.chars_with_trie(tailoring_trie).next() {
                    let left_ce32 = CollationElement32::new(left_u32);
                    let right_ce32 = CollationElement32::new(right_u32);
                    if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
                        if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
                            quick_primary_compare!(left_primary, right_primary, variable_top, self,);
                        }
                    }
                    // Try Hangul. (At least in the absence of PGO, it's better _not_ to put
                    // this into an `else` branch of the `left_primary` `if`.)
                    let right_hangul_offset = u32::from(right_c).wrapping_sub(HANGUL_S_BASE);
                    if right_hangul_offset < HANGUL_S_COUNT {
                        let left_hangul_offset = u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
                        if left_hangul_offset < HANGUL_S_COUNT {
                            hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
                        }
                    }
                }
            }
            // Note: It might look like a good idea to cache the CE32s, but
            // doing so actually makes things slower.
        },
        variable_top,
    );

    compare!(
        /// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
        /// are compared as if each one was a REPLACEMENT CHARACTER.
        ,
        compare_utf16,
        [u16],
        [u16],
        split_prefix_u16,
        chars_with_trie,
        chars_with_trie,
        self,
        left_tail,
        right_tail,
        {
            // Peeking at the first code unit on each side is enough for the
            // quick checks here: BMP lookup via `get16`, no surrogate pairing.
            let tailoring_trie = &self.tailoring.trie;
            if let Some(left_u) = left_tail.first() {
                if let Some(right_u) = right_tail.first() {
                    let left_u16 = *left_u;
                    let right_u16 = *right_u;
                    let left_u32 = tailoring_trie.get16(left_u16);
                    let right_u32 = tailoring_trie.get16(right_u16);
                    let left_ce32 = CollationElement32::new(left_u32);
                    let right_ce32 = CollationElement32::new(right_u32);
                    if let Some(mut left_primary) = left_ce32.to_primary_in_quick_check(self.tailoring) {
                        if let Some(mut right_primary) = right_ce32.to_primary_in_quick_check(self.tailoring) {
                            quick_primary_compare!(left_primary, right_primary, variable_top, self,);
                        }
                    }
                    // Try Hangul. Not putting in an `else` of the above consistent with UTF-8.
                    let left_hangul_offset = u32::from(left_u16).wrapping_sub(HANGUL_S_BASE);
                    if left_hangul_offset < HANGUL_S_COUNT {
                        if let Some(right_u) = right_tail.first() {
                            let right_u16 = *right_u;
                            let right_hangul_offset = u32::from(right_u16).wrapping_sub(HANGUL_S_BASE);
                            if right_hangul_offset < HANGUL_S_COUNT {
                                hangul_syllable_compare!(left_hangul_offset, right_hangul_offset,);
                            }
                        }
                    }
                }
            }
            // Note: It might look like a good idea to cache the CE32s, but
            // doing so actually makes things slower.
        },
        variable_top,
    );

    compare!(
        /// Compare Latin1 slices.
///
        /// ✨ *Enabled with the `latin1` Cargo feature.*
        #[cfg(feature = "latin1")]
        ,
        compare_latin1,
        [u8],
        [u8],
        split_prefix_latin1,
        latin1_chars_with_trie,
        latin1_chars_with_trie,
        self,
        left_tail,
        right_tail,
        {
            let tailoring_trie = &self.tailoring.trie;
            if let Some(left_u) = left_tail.first() {
                if let Some(right_u) = right_tail.first() {
                    let left_u8 = *left_u;
                    let right_u8 = *right_u;
                    // The probability of getting a non-simple ce32 from
                    // the Latin1 part above ASCII is so high that let's
                    // only consider ASCII on the left for this fast path.
                    // SAFETY: Checking the invariant of `get7` here.
                    if left_u8 < 0x80 {
                        // SAFETY: Invariant of `get7` checked above.
                        let left_u32 = unsafe { tailoring_trie.get7(left_u8) };
                        let left_ce32 = CollationElement32::new(left_u32);
                        // SAFETY: Checking the invariant of `get7` here for right.
                        if right_u8 < 0x80 {
                            // SAFETY: Invariant of `get7` checked above.
                            let right_u32 = unsafe { tailoring_trie.get7(right_u8) };
                            let right_ce32 = CollationElement32::new(right_u32);
                            // Should we use script reordering to cater to reordering
                            // digits or punctuation relative to letters?
                            if let Some(left_primary) = left_ce32.to_primary_simple() {
                                if let Some(right_primary) = right_ce32.to_primary_simple() {
                                    if (left_primary != right_primary)
                                        && (left_primary != 0)
                                        && (right_primary != 0)
                                        && !(left_primary < variable_top
                                            && left_primary > MERGE_SEPARATOR_PRIMARY)
                                        && !(right_primary < variable_top
                                            && right_primary > MERGE_SEPARATOR_PRIMARY)
                                    {
                                        if left_primary < right_primary {
                                            return Ordering::Less;
                                        }
                                        return Ordering::Greater;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // Note: It might look like a good idea to cache the CE32s, but
            // doing so actually makes things slower.
        },
        variable_top,
    );

    compare!(
        /// Compare Latin1 slice with potentially ill-formed UTF-16
        /// slice.
        ///
        /// If you need to compare a potentially ill-formed UTF-16
        /// slice with a Latin1 slice, swap the arguments and
        /// call `reverse()` on the return value.
        ///
        /// ✨ *Enabled with the `latin1` Cargo feature.*
        #[cfg(feature = "latin1")]
        ,
        compare_latin1_utf16,
        [u8],
        [u16],
        split_prefix_latin1_utf16,
        latin1_chars_with_trie,
        chars_with_trie,
        self,
        left_tail,
        right_tail,
        {
            let tailoring_trie = &self.tailoring.trie;
            if let Some(left_u) = left_tail.first() {
                if let Some(right_u) = right_tail.first() {
                    let left_u8 = *left_u;
                    // The probability of getting a non-simple ce32 from
                    // the Latin1 part above ASCII is so high that let's
                    // only consider ASCII on the left for this fast path.
                    // SAFETY: Checking the invariant of `get7` here.
                    if left_u8 < 0x80 {
                        // SAFETY: Invariant of `get7` checked above.
                        let left_u32 = unsafe { tailoring_trie.get7(left_u8) };
                        let left_ce32 = CollationElement32::new(left_u32);
                        let right_u16 = *right_u;
                        let right_u32 = tailoring_trie.get16(right_u16);
                        let right_ce32 = CollationElement32::new(right_u32);
                        if let Some(mut left_primary) = left_ce32.to_primary_simple() {
                            // Don't use the macro to micro-optimize away the long primary
                            // case for ASCII.
                            if let Some(mut right_primary) =
                                right_ce32.to_primary_in_quick_check(self.tailoring)
                            {
                                if (left_primary != right_primary)
                                    && (left_primary != 0)
                                    && (right_primary != 0)
                                    && !(left_primary < variable_top
                                        && left_primary > MERGE_SEPARATOR_PRIMARY)
                                    && !(right_primary < variable_top
                                        && right_primary > MERGE_SEPARATOR_PRIMARY)
                                {
                                    if let Some(reordering) = &self.reordering {
                                        left_primary = reordering.reorder(left_primary);
                                        right_primary = reordering.reorder(right_primary);
                                    }
                                    if left_primary < right_primary {
                                        return Ordering::Less;
                                    }
                                    return Ordering::Greater;
                                }
                            }
                        }
                    }
                }
            }
            // Note: It might look like a good idea to cache the CE32s, but
            // doing so actually makes things slower.
},
        variable_top,
    );

    // Returns the numeric collation primary weight iff numeric mode is on.
    // NOTE(review): the return type reads bare `Option` here; a generic
    // argument (presumably `Option<u32>`) appears to have been lost in
    // transit — restore from upstream.
    #[inline(always)]
    fn numeric_primary(&self) -> Option {
        if self.options.numeric() {
            Some(self.special_primaries.numeric_primary)
        } else {
            None
        }
    }

    // Exclusive upper bound of variable primaries, or 0 when alternate
    // handling is non-ignorable (nothing is treated as variable).
    #[inline(always)]
    fn variable_top(&self) -> u32 {
        if self.options.alternate_handling() == AlternateHandling::NonIgnorable {
            0
        } else {
            // +1 so that we can use "<" and primary ignorables test out early.
            self.special_primaries
                .last_primary_for_group(self.options.max_variable())
                + 1
        }
    }

    /// The implementation of the comparison operation.
    ///
    /// `head_chars` is an iterator _backward_ over the identical
    /// prefix and `left_chars` and `right_chars` are iterators
    /// _forward_ over the parts after the identical prefix.
    // NOTE(review): the generic parameter list (`<L, R, H, T>`) and the
    // `Iterator<Item = …>` type arguments appear to have been lost in
    // transit; restore from upstream before compiling.
    fn compare_impl(
        &'data self,
        left_chars: L,
        right_chars: R,
        mut head_chars: H,
        variable_top: u32,
    ) -> Ordering
    where
        L: Iterator + WithTrie<'data, T, u32> + 'data,
        R: Iterator + WithTrie<'data, T, u32> + 'data,
        H: DoubleEndedIterator + 'data,
        T: AbstractCodePointTrie<'data, u32> + 'data,
    {
        // Sadly, it looks like variable CEs and backward second level
        // require us to store the full 64-bit CEs instead of storing only
        // the NonPrimary part.
        //
        // TODO(#2008): Consider having two monomorphizations of this method:
        // one that can deal with variables shifted to quaternary and
        // backward second level and another that doesn't support that
        // and only stores `NonPrimary` in `left_ces` and `right_ces`
        // with double the number of stack allocated elements.

        // Note: These are used only after the identical prefix skipping,
        // but initializing these up here improves performance at the time
        // of writing. Presumably the source order affects the stack frame
        // layout.
        let mut left_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
        let mut right_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();

        // The algorithm comes from CollationCompare::compareUpToQuaternary in ICU4C.
let mut any_variable = false;
        let numeric_primary = self.numeric_primary();
        let mut left = collation_elements!(self, left_chars, numeric_primary);
        let mut right = collation_elements!(self, right_chars, numeric_primary);

        // Start identical prefix
        //
        // The logic here to check whether the boundary found by skipping
        // the identical prefix is safe is complicated compared to the ICU4C
        // approach of having a set of characters that are unsafe as the character
        // immediately following the identical prefix. However, the approach here
        // avoids extra data, and working on the main data avoids the bug
        // possibility of data structures not being mutually consistent.
        //
        // This code intentionally does not keep around the `CollationElement32`s
        // that have been read from the collation data tries, because keeping
        // them around turned out to be a pessimization: There would be added
        // branches on the hot path of the algorithm that maps characters to
        // collation elements, and the element size of the upcoming buffer
        // would grow.
        //
        // However, the values read from the normalization trie _are_ kept around,
        // since there is already a place where to put them.

        // This loop is only broken out of as goto forward.
        #[expect(clippy::never_loop)]
        'prefix: loop {
            if let Some((mut head_last_c, head_last_trie_val)) = head_chars.next_back() {
                let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
                    head_last_c,
                    head_last_trie_val,
                );
                let mut head_last_ce32 = CollationElement32::default();
                let mut head_last_ok = false;
                if let Some(left_different) = left.iter_next_before_init() {
                    left.prepend_upcoming_before_init(left_different.clone());
                    if let Some(right_different) = right.iter_next_before_init() {
                        // Note: left_different and right_different may both be U+FFFD.
                        right.prepend_upcoming_before_init(right_different.clone());

                        // The base logic is that a boundary between two starters
                        // that decompose to selves is safe iff the starter
                        // before the boundary can't contract a starter, the
                        // starter after the boundary doesn't have a prefix
                        // condition, and, with the numeric mode enabled,
                        // they aren't both numeric.
                        //
                        // This base logic is then extended with Hangul
                        // syllables and characters that decompose to a
                        // BMP starter followed by a BMP non-starter.
                        // The logic could be extended further, in
                        // particular to cover singleton decompositions
                        // to a BMP starter, but such characters can be
                        // expected to be rare enough in real-world input
                        // that it's not worthwhile to make this code more
                        // branchy.
                        //
                        // A Hangul syllable is safe on either side of the
                        // boundary, because Hangul syllables can't participate
                        // in contraction or have prefix conditions. They are
                        // also known not to be numeric.
                        //
                        // Hangul jamo is safe to look up from the main trie
                        // instead of the jamo table, because they aren't
                        // allowed to participate in contractions or prefix
                        // conditions, either, and are known not to be numeric.
                        //
                        // After a boundary, a decomposition to a BMP starter
                        // and a BMP non-starter can obviously be analyzed by
                        // considering the starter as if it was a starter
                        // that decomposes to self.
                        //
                        // Before a boundary the contraction condition considers
                        // whether the contraction can contract a starter.
                        // For the case of contracting a non-starter, it's
                        // fine for the BMP starter of the decomposition to
                        // contract the non-starter from the same decomposition:
                        // Since that would happen as part of the prefix that
                        // is identical, it wouldn't affect anything after.
                        //
                        // The case of contracting a non-starter other than
                        // the one that came from the decomposition itself
                        // is irrelevant, because we don't allow a non-starter
                        // right after the boundary regardless of the contraction
                        // status of what's before the boundary.
                        //
                        // Finally, decompositions to starter and non-starter
                        // are known not to be numeric.

                        // The checks below are repetitive, but an attempt to factor
                        // repetitive code into an inlined function regressed
                        // performance, so it seems that having the control flow
                        // right here without an intermediate enum from a
                        // function return to branch on is important.

                        // This loop is only broken out of as goto forward. The control flow
                        // is much more readable this way.
                        #[expect(clippy::never_loop)]
                        loop {
                            // The two highest bits are about NFC, which we don't
                            // care about here.
                            let decomposition = head_last.trie_val;
                            if (decomposition
                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
                                == 0
                            {
                                // Intentionally empty block to keep
                                // the same structure as in the cases
                                // where something happens here.
                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
                                && ((decomposition & LOW_ZEROS_MASK) != 0)
                            {
                                // Decomposition into two BMP characters: starter and non-starter
                                // Let's take the starter
                                head_last_c = char_from_u32(decomposition & 0x7FFF);
                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
                                head_last_ce32 = IDENTICAL_PREFIX_HANGUL_MARKER_CE32;
                            } else {
                                break;
                            }
                            head_last_ok = true;
                            let left_c;
                            let right_c;
                            let decomposition = left_different.trie_val;
                            if (decomposition
                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
                                == 0
                            {
                                left_c = left_different.character();
                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
                                && ((decomposition & LOW_ZEROS_MASK) != 0)
                            {
                                // Decomposition into two BMP characters: starter and non-starter
                                // Let's take the starter
                                left_c = char_from_u32(decomposition & 0x7FFF);
                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
                                left_c = left_different.character();
                                if right_different.trie_val == HANGUL_SYLLABLE_MARKER {
                                    // Hangul syllable to Hangul syllable comparison can give us
                                    // a quick exit.
                                    let left_hangul_offset =
                                        u32::from(left_c).wrapping_sub(HANGUL_S_BASE);
                                    let right_hangul_offset = u32::from(right_different.character())
                                        .wrapping_sub(HANGUL_S_BASE);
                                    // If these are not in range, we have a memory-safe GIGO case.
                                    debug_assert!(left_hangul_offset < HANGUL_S_COUNT);
                                    debug_assert!(right_hangul_offset < HANGUL_S_COUNT);
                                    hangul_syllable_compare!(
                                        left_hangul_offset,
                                        right_hangul_offset,
                                    );
                                    // We had a two-jamo syllable whose jamo matched
                                    // the first two jamo of the other syllable.
                                    // Still, we're at a good boundary.
                                    break 'prefix;
                                }
                            } else {
                                break;
                            }
                            let decomposition = right_different.trie_val;
                            if (decomposition
                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
                                == 0
                            {
                                right_c = right_different.character();
                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
                                && ((decomposition & LOW_ZEROS_MASK) != 0)
                            {
                                // Decomposition into two BMP characters: starter and non-starter
                                // Let's take the starter
                                right_c = char_from_u32(decomposition & 0x7FFF);
                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
                                right_c = right_different.character();
                            } else {
                                break;
                            }
                            // The last character of the prefix is OK on the normalization
                            // level. Now let's check its ce32 unless it's a Hangul syllable,
                            // in which case `head_last_ce32` already is a non-default placeholder.
                            if head_last_ce32 == CollationElement32::default() {
                                head_last_ce32 = self.tailoring.ce32_for_char(head_last_c);
                                if head_last_ce32 == FALLBACK_CE32 {
                                    head_last_ce32 = self.root.ce32_for_char(head_last_c);
                                }
                                if head_last_ce32.tag_checked() == Some(Tag::Contraction)
                                    && head_last_ce32.at_least_one_suffix_contains_starter()
                                {
                                    break;
                                }
                            }
                            let mut left_data = self.tailoring;
                            let mut left_ce32 = self.tailoring.ce32_for_char(left_c);
                            if left_ce32 == FALLBACK_CE32 {
                                left_ce32 = self.root.ce32_for_char(left_c);
                                left_data = self.root;
                            }
                            let mut right_data = self.tailoring;
                            let mut right_ce32 = self.tailoring.ce32_for_char(right_c);
                            if right_ce32 == FALLBACK_CE32 {
                                right_ce32 = self.root.ce32_for_char(right_c);
                                right_data = self.root;
                            }
                            // We don't actually need to do this.
                            // if self.options.numeric()
                            //     && head_last_ce32.tag_checked() == Some(Tag::Digit)
                            //     && (left_ce32.tag_checked() == Some(Tag::Digit)
                            //         || right_ce32.tag_checked() == Some(Tag::Digit))
                            // {
                            //     break;
                            // }

                            // We might be at a good boundary unless either ce32 is a
                            // prefix ce32. Let's check for the happy path first, though.
                            // Now check whether the ce32s we have are simple enough to
                            // make a quick decision here.
                            if let Some(mut left_primary) =
                                left_ce32.to_primary_in_quick_check(left_data)
                            {
                                if let Some(mut right_primary) =
                                    right_ce32.to_primary_in_quick_check(right_data)
                                {
                                    quick_primary_compare!(
                                        left_primary,
                                        right_primary,
                                        variable_top,
                                        self,
                                    );
                                }
                            }
                            if left_ce32.tag_checked() == Some(Tag::Prefix)
                                || right_ce32.tag_checked() == Some(Tag::Prefix)
                            {
                                // TODO: For the Japanese tailoring, it might actually be worthwhile
                                // to handle the prefix ce32s inline above the quick check. We've already
                                // read the last character from `head` anyway.
                                break;
                            }
                            // We're at a good boundary but could not make a quick primary comparison decision.
                            // Note: It might look like a good idea to cache the CE32s, but
                            // doing so actually makes things slower.
                            break 'prefix;
                        }
                    }
                }
                // Could not establish a safe boundary right at the end of the
                // identical prefix: step backward one character at a time,
                // pushing each stepped-over character back onto both sides,
                // until a safe boundary (or the start of the prefix) is found.
                let mut tail_first_c;
                let mut tail_first_ce32;
                let mut tail_first_ok;
                loop {
                    // Take a step back.
                    left.prepend_upcoming_before_init(head_last.clone());
                    right.prepend_upcoming_before_init(head_last.clone());
                    tail_first_c = head_last_c;
                    tail_first_ce32 = head_last_ce32;
                    tail_first_ok = head_last_ok;
                    let Some((head_last_c_new, decomposition)) = head_chars.next_back() else {
                        // We need to step back beyond the start of the prefix.
                        // Treat as good boundary.
                        break 'prefix;
                    };
                    head_last_c = head_last_c_new;
                    head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
                        head_last_c,
                        decomposition,
                    );
                    head_last_ce32 = CollationElement32::default();
                    head_last_ok = false;
                    if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
                        // Intentionally empty block to keep
                        // the same structure as in the cases
                        // where something happens here.
                    } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
                        && ((decomposition & LOW_ZEROS_MASK) != 0)
                    {
                        // Decomposition into two BMP characters: starter and non-starter
                        // Let's take the starter
                        head_last_c = char_from_u32(decomposition & 0x7FFF);
                    } else if decomposition == HANGUL_SYLLABLE_MARKER {
                        // NOTE(review): this arm uses `FFFD_CE32` as the non-default
                        // placeholder whereas the corresponding pre-loop path uses
                        // `IDENTICAL_PREFIX_HANGUL_MARKER_CE32` — confirm intentional.
                        head_last_ce32 = FFFD_CE32;
                    } else {
                        continue;
                    }
                    head_last_ok = true;
                    if !tail_first_ok {
                        continue;
                    }
                    // The last character of the prefix is OK on the normalization
                    // level. Now let's check its ce32 unless it's a Hangul syllable.
                    if head_last_ce32 == CollationElement32::default() {
                        head_last_ce32 = self.tailoring.ce32_for_char(head_last_c);
                        if head_last_ce32 == FALLBACK_CE32 {
                            head_last_ce32 = self.root.ce32_for_char(head_last_c);
                        }
                        if head_last_ce32.tag_checked() == Some(Tag::Contraction)
                            && head_last_ce32.at_least_one_suffix_contains_starter()
                        {
                            continue;
                        }
                    }
                    // Check this _after_ `head_last_ce32` to make sure
                    // `head_last_ce32` is initialized for the next loop round
                    // trip if applicable.
if tail_first_ce32 == CollationElement32::default() {
                        tail_first_ce32 = self.tailoring.ce32_for_char(tail_first_c);
                        if tail_first_ce32 == FALLBACK_CE32 {
                            tail_first_ce32 = self.root.ce32_for_char(tail_first_c);
                        }
                    }
                    // else we already have a trie value from the previous loop iteration or we have Hangul syllable
                    if tail_first_ce32.tag_checked() == Some(Tag::Prefix) {
                        continue;
                    }
                    if self.options.numeric()
                        && head_last_ce32.tag_checked() == Some(Tag::Digit)
                        && tail_first_ce32.tag_checked() == Some(Tag::Digit)
                    {
                        continue;
                    }
                    // We are at a good boundary!
                    break 'prefix;
                }
            } else {
                // The prefix is empty
                break 'prefix;
            }
            // Unreachable line
        }
        // End identical prefix

        left.init();
        right.init();

        // Primary level: pull CEs from both sides into the buffers while
        // comparing primaries. Variable CEs are shifted to the quaternary
        // level (stored with non-primary halves zeroed) and primary
        // ignorables following a variable CE are dropped to zero likewise.
        loop {
            let mut left_primary;
            'left_primary_loop: loop {
                let ce = left.next();
                left_primary = ce.primary();
                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
                // shifting variable CEs.
                if !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY) {
                    left_ces.push(ce);
                } else {
                    // Variable CE, shift it to quaternary level.
                    // Ignore all following primary ignorables, and shift further variable CEs.
                    any_variable = true;
                    // Relative to ICU4C, the next line is hoisted out of the following loop
                    // in order to keep the variables called `ce` immutable to make it easier
                    // to reason about each assignment into `ce` resulting in exactly a single
                    // push into `left_ces`.
                    left_ces.push(ce.clone_with_non_primary_zeroed());
                    loop {
                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
                        let ce = left.next();
                        left_primary = ce.primary();
                        if left_primary != 0
                            && !(left_primary < variable_top
                                && left_primary > MERGE_SEPARATOR_PRIMARY)
                        {
                            // Neither a primary ignorable nor a variable CE.
                            left_ces.push(ce);
                            break 'left_primary_loop;
                        }
                        // If `left_primary == 0`, the following line ignores a primary-ignorable.
                        // Otherwise, it shifts a variable CE.
                        left_ces.push(ce.clone_with_non_primary_zeroed());
                    }
                }
                if left_primary != 0 {
                    break;
                }
            }
            let mut right_primary;
            'right_primary_loop: loop {
                let ce = right.next();
                right_primary = ce.primary();
                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
                // shifting variable CEs.
                if !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY) {
                    right_ces.push(ce);
                } else {
                    // Variable CE, shift it to quaternary level.
                    // Ignore all following primary ignorables, and shift further variable CEs.
                    any_variable = true;
                    // Relative to ICU4C, the next line is hoisted out of the following loop
                    // in order to keep the variables called `ce` immutable to make it easier
                    // to reason about each assignment into `ce` resulting in exactly a single
                    // push into `right_ces`.
                    right_ces.push(ce.clone_with_non_primary_zeroed());
                    loop {
                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
                        let ce = right.next();
                        right_primary = ce.primary();
                        if right_primary != 0
                            && !(right_primary < variable_top
                                && right_primary > MERGE_SEPARATOR_PRIMARY)
                        {
                            // Neither a primary ignorable nor a variable CE.
                            right_ces.push(ce);
                            break 'right_primary_loop;
                        }
                        // If `right_primary == 0`, the following line ignores a primary-ignorable.
                        // Otherwise, it shifts a variable CE.
                        right_ces.push(ce.clone_with_non_primary_zeroed());
                    }
                }
                if right_primary != 0 {
                    break;
                }
            }
            if left_primary != right_primary {
                // Reordering must be applied before deciding the order, but
                // only on the slow path where primaries actually differ.
                if let Some(reordering) = &self.reordering {
                    left_primary = reordering.reorder(left_primary);
                    right_primary = reordering.reorder(right_primary);
                }
                if left_primary < right_primary {
                    return Ordering::Less;
                }
                return Ordering::Greater;
            }
            if left_primary == NO_CE_PRIMARY {
                break;
            }
        }
        // Sadly, we end up pushing the sentinel value, which means these
        // `SmallVec`s allocate more often than if we didn't actually
        // store the sentinel.
debug_assert_eq!(left_ces.last(), Some(&NO_CE));
        debug_assert_eq!(right_ces.last(), Some(&NO_CE));

        // Note: `unwrap_or_default` in the iterations below should never
        // actually end up using the "_or_default" part, because the sentinel
        // is in the `SmallVec`s. These could be changed to `unwrap()` if we
        // preferred panic in case of a bug.

        // TODO(#2009): Should we save one slot by not putting the sentinel in
        // the `SmallVec`s? So far, the answer seems "no", as it would complicate
        // the primary comparison above.

        // Compare the buffered secondary & tertiary weights.
        // We might skip the secondary level but continue with the case level
        // which is turned on separately.
        if self.options.strength() >= Strength::Secondary {
            if !self.options.backward_second_level() {
                // Forward secondary comparison: scan both buffers, skipping
                // zero (secondary-ignorable) weights.
                let mut left_iter = left_ces.iter();
                let mut right_iter = right_ces.iter();
                let mut left_secondary;
                let mut right_secondary;
                loop {
                    loop {
                        left_secondary = left_iter.next().unwrap_or_default().secondary();
                        if left_secondary != 0 {
                            break;
                        }
                    }
                    loop {
                        right_secondary = right_iter.next().unwrap_or_default().secondary();
                        if right_secondary != 0 {
                            break;
                        }
                    }
                    if left_secondary != right_secondary {
                        if left_secondary < right_secondary {
                            return Ordering::Less;
                        }
                        return Ordering::Greater;
                    }
                    if left_secondary == NO_CE_SECONDARY {
                        break;
                    }
                }
            } else {
                // Backward secondary comparison: split each buffer into
                // segments delimited by real primaries (or separators) and
                // compare the secondaries of each segment back-to-front.
                let mut left_remaining = &left_ces[..];
                let mut right_remaining = &right_ces[..];
                loop {
                    if left_remaining.is_empty() {
                        debug_assert!(right_remaining.is_empty());
                        break;
                    }
                    let (left_prefix, right_prefix) = {
                        let mut left_iter = left_remaining.iter();
                        loop {
                            let left_primary = left_iter.next().unwrap_or_default().primary();
                            if left_primary != 0 && left_primary <= MERGE_SEPARATOR_PRIMARY {
                                break;
                            }
                            debug_assert_ne!(left_primary, NO_CE_PRIMARY);
                        }
                        let left_new_remaining = left_iter.as_slice();
                        // Index in range by construction
                        #[expect(clippy::indexing_slicing)]
                        let left_prefix =
                            &left_remaining[..left_remaining.len() - 1 - left_new_remaining.len()];
                        left_remaining = left_new_remaining;
                        let mut right_iter = right_remaining.iter();
                        loop {
                            let right_primary = right_iter.next().unwrap_or_default().primary();
                            if right_primary != 0 && right_primary <= MERGE_SEPARATOR_PRIMARY {
                                break;
                            }
                            debug_assert_ne!(right_primary, NO_CE_PRIMARY);
                        }
                        let right_new_remaining = right_iter.as_slice();
                        // Index in range by construction
                        #[expect(clippy::indexing_slicing)]
                        let right_prefix = &right_remaining
                            [..right_remaining.len() - 1 - right_new_remaining.len()];
                        right_remaining = right_new_remaining;
                        (left_prefix, right_prefix)
                    };
                    let mut left_iter = left_prefix.iter();
                    let mut right_iter = right_prefix.iter();
                    let mut left_secondary;
                    let mut right_secondary;
                    loop {
                        loop {
                            left_secondary =
                                left_iter.next_back().unwrap_or_default().secondary();
                            if left_secondary != 0 {
                                break;
                            }
                        }
                        loop {
                            right_secondary =
                                right_iter.next_back().unwrap_or_default().secondary();
                            if right_secondary != 0 {
                                break;
                            }
                        }
                        if left_secondary != right_secondary {
                            if left_secondary < right_secondary {
                                return Ordering::Less;
                            }
                            return Ordering::Greater;
                        }
                        if left_secondary == NO_CE_SECONDARY {
                            break;
                        }
                    }
                }
            }
        }

        if self.options.case_level() {
            let mut left_non_primary;
            let mut right_non_primary;
            let mut left_case;
            let mut right_case;
            let mut left_iter = left_ces.iter();
            let mut right_iter = right_ces.iter();
            if self.options.strength() == Strength::Primary {
                // Primary+caseLevel: Ignore case level weights of primary ignorables.
                // Otherwise we would get a-umlaut > a
                // which is not desirable for accent-insensitive sorting.
                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
                // with only primary weights.
loop {
                    loop {
                        let ce = left_iter.next().unwrap_or_default();
                        left_non_primary = ce.non_primary();
                        if !ce.either_half_zero() {
                            break;
                        }
                    }
                    left_case = left_non_primary.case();
                    loop {
                        let ce = right_iter.next().unwrap_or_default();
                        right_non_primary = ce.non_primary();
                        if !ce.either_half_zero() {
                            break;
                        }
                    }
                    right_case = right_non_primary.case();
                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
                    // There is one case weight for each previous-level weight,
                    // so level length differences were handled there.
                    if left_case != right_case {
                        if !self.options.upper_first() {
                            if left_case < right_case {
                                return Ordering::Less;
                            }
                            return Ordering::Greater;
                        }
                        // upper_first: invert the order of the case weights.
                        if left_case < right_case {
                            return Ordering::Greater;
                        }
                        return Ordering::Less;
                    }
                    if left_non_primary.secondary() == NO_CE_SECONDARY {
                        break;
                    }
                }
            } else {
                // Secondary+caseLevel: By analogy with the above,
                // ignore case level weights of secondary ignorables.
                //
                // Note: A tertiary CE has uppercase case bits (0.0.ut)
                // to keep tertiary+caseFirst well-formed.
                //
                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
                // Otherwise a tertiary CE's uppercase would be no greater than
                // a primary/secondary CE's uppercase.
                // (See UCA well-formedness condition 2.)
                // We could construct a special case weight higher than uppercase,
                // but it's simpler to always ignore case weights of secondary ignorables,
                // turning 0.0.ut into 0.0.0.t.
                // (See LDML Collation, Case Parameters.)
                loop {
                    loop {
                        left_non_primary = left_iter.next().unwrap_or_default().non_primary();
                        if left_non_primary.secondary() != 0 {
                            break;
                        }
                    }
                    left_case = left_non_primary.case();
                    loop {
                        right_non_primary = right_iter.next().unwrap_or_default().non_primary();
                        if right_non_primary.secondary() != 0 {
                            break;
                        }
                    }
                    right_case = right_non_primary.case();
                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
                    // There is one case weight for each previous-level weight,
                    // so level length differences were handled there.
                    if left_case != right_case {
                        if !self.options.upper_first() {
                            if left_case < right_case {
                                return Ordering::Less;
                            }
                            return Ordering::Greater;
                        }
                        // upper_first: invert the order of the case weights.
                        if left_case < right_case {
                            return Ordering::Greater;
                        }
                        return Ordering::Less;
                    }
                    if left_non_primary.secondary() == NO_CE_SECONDARY {
                        break;
                    }
                }
            }
        }

        // Tertiary level (with case bits possibly folded in via the mask),
        // while also accumulating whether any quaternary weights exist at all.
        if let Some(tertiary_mask) = self.options.tertiary_mask() {
            let mut any_quaternaries = AnyQuaternaryAccumulator::new();
            let mut left_iter = left_ces.iter();
            let mut right_iter = right_ces.iter();
            loop {
                let mut left_non_primary;
                let mut left_tertiary;
                loop {
                    left_non_primary = left_iter.next().unwrap_or_default().non_primary();
                    any_quaternaries.accumulate(left_non_primary);
                    debug_assert!(
                        left_non_primary.tertiary() != 0
                            || left_non_primary.case_quaternary() == 0
                    );
                    left_tertiary = left_non_primary.tertiary_case_quarternary(tertiary_mask);
                    if left_tertiary != 0 {
                        break;
                    }
                }
                let mut right_non_primary;
                let mut right_tertiary;
                loop {
                    right_non_primary = right_iter.next().unwrap_or_default().non_primary();
                    any_quaternaries.accumulate(right_non_primary);
                    debug_assert!(
                        right_non_primary.tertiary() != 0
                            || right_non_primary.case_quaternary() == 0
                    );
                    right_tertiary = right_non_primary.tertiary_case_quarternary(tertiary_mask);
                    if right_tertiary != 0 {
                        break;
                    }
                }
                if left_tertiary != right_tertiary {
                    if self.options.upper_first() {
                        // Pass through NO_CE and keep real tertiary weights larger than that.
                        // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
                        // to keep tertiary CEs well-formed.
                        // Their case+tertiary weights must be greater than those of
                        // primary and secondary CEs.
                        // Magic numbers from ICU4C.
if left_tertiary > NO_CE_TERTIARY {
                            if left_non_primary.secondary() != 0 {
                                left_tertiary ^= 0xC000;
                            } else {
                                left_tertiary += 0x4000;
                            }
                        }
                        if right_tertiary > NO_CE_TERTIARY {
                            if right_non_primary.secondary() != 0 {
                                right_tertiary ^= 0xC000;
                            } else {
                                right_tertiary += 0x4000;
                            }
                        }
                    }
                    if left_tertiary < right_tertiary {
                        return Ordering::Less;
                    }
                    return Ordering::Greater;
                }
                if left_tertiary == NO_CE_TERTIARY {
                    break;
                }
            }
            // If no variable CE was shifted and no quaternary weight was
            // observed, the quaternary level cannot distinguish the strings.
            if !any_variable && !any_quaternaries.has_quaternary() {
                return Ordering::Equal;
            }
        } else {
            return Ordering::Equal;
        }

        if self.options.strength() <= Strength::Tertiary {
            return Ordering::Equal;
        }

        // Quaternary level: tertiary-ignorable CEs (including shifted
        // variable CEs, which were stored with only their primary weight)
        // contribute their primary; all others contribute their quaternary.
        let mut left_iter = left_ces.iter();
        let mut right_iter = right_ces.iter();
        loop {
            let mut left_quaternary;
            loop {
                let ce = left_iter.next().unwrap_or_default();
                if ce.tertiary_ignorable() {
                    left_quaternary = ce.primary();
                } else {
                    left_quaternary = ce.quaternary();
                }
                if left_quaternary != 0 {
                    break;
                }
            }
            let mut right_quaternary;
            loop {
                let ce = right_iter.next().unwrap_or_default();
                if ce.tertiary_ignorable() {
                    right_quaternary = ce.primary();
                } else {
                    right_quaternary = ce.quaternary();
                }
                if right_quaternary != 0 {
                    break;
                }
            }
            if left_quaternary != right_quaternary {
                if let Some(reordering) = &self.reordering {
                    left_quaternary = reordering.reorder(left_quaternary);
                    right_quaternary = reordering.reorder(right_quaternary);
                }
                if left_quaternary < right_quaternary {
                    return Ordering::Less;
                }
                return Ordering::Greater;
            }
            if left_quaternary == NO_CE_PRIMARY {
                break;
            }
        }
        Ordering::Equal
    }

    // Bitmask of the levels to write for the configured strength (looked up
    // from `LEVEL_MASKS`), plus the case-level flag when the case level is
    // switched on separately from the strength.
    fn sort_key_levels(&self) -> u8 {
        #[expect(clippy::indexing_slicing)]
        let mut levels = LEVEL_MASKS[self.options.strength() as usize];
        if self.options.case_level() {
            levels |= CASE_LEVEL_FLAG;
        }
        levels
    }

    /// Given valid UTF-8, write the sort key bytes up to the collator's strength.
    ///
    /// The bytes are written to an implementor of [`CollationKeySink`], a no-std version of `std::io::Write`.
/// This trait is currently unstable, but it is implemented by `Vec`, `VecDeque`, /// `SmallVec<[u8; N]>`, and `&mut [u8]` (returning an error if the slice is too small). /// /// If two sort keys generated at the same strength are compared bytewise, the result is /// the same as a collation comparison of the original strings at that strength. /// /// For identical strength, the UTF-8 NFD normalization is appended for breaking ties. /// /// No terminating zero byte is written to the output, so the output is not a valid C /// string, but the caller may append a zero afterward if a C string is desired. /// /// ⚠️ Generating a sort key is expensive relative to comparison because to compare, the /// collator skips identical prefixes before doing more complex comparison. Only use sort /// keys if you expect to compare them many times so as to amortize the cost of generating /// them. Measurement of this performance trade-off would be a good idea. /// /// ⚠️ Sort keys, if stored durably, should be presumed to be invalidated by a CLDR update, a /// new version of Unicode, or an update to the ICU4X code. Applications using sort keys /// *must* be prepared to recompute them if required and should take the performance of /// such an operation into account when deciding to use sort keys. /// /// ⚠️ If you should store sort keys in a database that is or becomes so large that /// regenerating sort keys becomes impractical, you should not expect ICU4X to support your /// using an older, frozen copy of the sort key generation algorithm with a later version /// of the library. 
/// /// # Example /// /// ``` /// use icu_collator::{ /// options::{CollatorOptions, Strength}, /// Collator, /// }; /// use icu_locale::locale; /// let locale = locale!("utf").into(); /// let mut options = CollatorOptions::default(); /// options.strength = Some(Strength::Primary); /// let collator = Collator::try_new(locale, options).unwrap(); /// /// let mut k1 = Vec::new(); /// let Ok(()) = collator.write_sort_key_to("hello", &mut k1); /// let mut k2 = Vec::new(); /// let Ok(()) = collator.write_sort_key_to("Héłłö", &mut k2); /// assert_eq!(k1, k2); /// ``` pub fn write_sort_key_to(&self, s: &str, sink: &mut S) -> Result where S: CollationKeySink + ?Sized, S::State: Default, { self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink) } /// Given potentially invalid UTF-8, write the sort key bytes up to the collator's strength. /// /// For further details, see [`Self::write_sort_key_to`]. pub fn write_sort_key_utf8_to(&self, s: &[u8], sink: &mut S) -> Result where S: CollationKeySink + ?Sized, S::State: Default, { self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink) } /// Given potentially invalid UTF-16, write the sort key bytes up to the collator's strength. /// /// For further details, see [`Self::write_sort_key_to`]. 
    // NOTE(review): the generic parameter lists on the signatures below (e.g. `<S>`,
    // `<I, T, S>`, and the type arguments of `Result`) appear to have been lost in
    // extraction — confirm against the upstream file before building.
    pub fn write_sort_key_utf16_to(&self, s: &[u16], sink: &mut S) -> Result
    where
        S: CollationKeySink + ?Sized,
        S::State: Default,
    {
        self.write_sort_key_impl(s.chars_with_trie(self.norm_trie()), sink)
    }

    /// Shared implementation behind the `write_sort_key*_to` entry points: writes the
    /// levels up to quaternary, then, only when the strength is `Identical`, a level
    /// separator followed by the identical level computed from the NFD of the input
    /// (see `write_identical_level`). Finally asks the sink to finalize its state.
    fn write_sort_key_impl(
        &'data self,
        iter: I,
        sink: &mut S,
    ) -> Result
    where
        I: Iterator + WithTrie<'data, T, u32> + Clone + 'data,
        T: AbstractCodePointTrie<'data, u32> + 'data,
        S: CollationKeySink + ?Sized,
        S::State: Default,
    {
        // Clone the un-consumed character iterator up front, since the quaternary
        // pass below consumes `iter`.
        let identical = if self.options.strength() == Strength::Identical {
            Some(iter.clone())
        } else {
            None
        };
        let mut state = S::State::default();
        self.write_sort_key_up_to_quaternary(iter, sink, &mut state)?;
        if let Some(iter) = identical {
            sink.write_byte(&mut state, LEVEL_SEPARATOR_BYTE)?;
            let mut iter = icu_normalizer::new_decomposition(iter, self.tables);
            let _ = iter.next(); // Discard the U+0000.
            write_identical_level(iter, sink, &mut state)?;
        }
        sink.finish(state)
    }

    /// Write the sort key bytes up to the collator's strength.
    ///
    /// Optionally write the case level. Separate levels with the `LEVEL_SEPARATOR_BYTE`, but
    /// do not write a terminating zero as with a C string.
    ///
    /// The primary level is streamed directly to `sink`; the secondary, case, tertiary,
    /// and quaternary levels are buffered in `SortKeyLevel`s and flushed at the end,
    /// each with run-length compression of its "common" weight.
    fn write_sort_key_up_to_quaternary(
        &'data self,
        iter: I,
        sink: &mut S,
        state: &mut S::State,
    ) -> Result<(), S::Error>
    where
        I: Iterator + WithTrie<'data, T, u32> + Clone + 'data,
        T: AbstractCodePointTrie<'data, u32> + 'data,
        S: CollationKeySink + ?Sized,
    {
        // This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C.
        let levels = self.sort_key_levels();
        let mut iter = collation_elements!(self, iter, self.numeric_primary());
        iter.init();
        let variable_top = self.variable_top();
        let tertiary_mask = self.options.tertiary_mask().unwrap_or_default();
        // Per-level output buffers (primary goes straight to the sink).
        let mut cases = SortKeyLevel::default();
        let mut secondaries = SortKeyLevel::default();
        let mut tertiaries = SortKeyLevel::default();
        let mut quaternaries = SortKeyLevel::default();
        let mut prev_reordered_primary = 0;
        // Pending run lengths of the "common" weight on each level, flushed lazily
        // so runs can be compressed into low/middle/high range bytes.
        let mut common_cases = 0usize;
        let mut common_secondaries = 0usize;
        let mut common_tertiaries = 0usize;
        let mut common_quaternaries = 0usize;
        let mut prev_secondary = 0;
        let mut sec_segment_start = 0;
        loop {
            let mut ce = iter.next();
            let mut p = ce.primary();
            if p < variable_top && p > MERGE_SEPARATOR_PRIMARY {
                // Variable CE, shift it to quaternary level. Ignore all following primary
                // ignorables, and shift further variable CEs.
                if common_quaternaries != 0 {
                    common_quaternaries -= 1;
                    while common_quaternaries >= QUAT_COMMON[WEIGHT_MAX_COUNT] as _ {
                        quaternaries.append_byte(QUAT_COMMON[WEIGHT_MIDDLE]);
                        common_quaternaries -= QUAT_COMMON[WEIGHT_MAX_COUNT] as usize;
                    }
                    // Shifted primary weights are lower than the common weight.
                    quaternaries.append_byte(QUAT_COMMON[WEIGHT_LOW] + common_quaternaries as u8);
                    common_quaternaries = 0;
                }
                loop {
                    if levels & QUATERNARY_LEVEL_FLAG != 0 {
                        if let Some(reordering) = &self.reordering {
                            p = reordering.reorder(p);
                        }
                        if (p >> 24) as u8 >= QUAT_SHIFTED_LIMIT_BYTE {
                            // Prevent shifted primary lead bytes from overlapping with the
                            // common compression range.
                            quaternaries.append_byte(QUAT_SHIFTED_LIMIT_BYTE);
                        }
                        quaternaries.append_weight_32(p);
                    }
                    // Skip primary ignorables that follow the shifted CE.
                    loop {
                        ce = iter.next();
                        p = ce.primary();
                        if p != 0 {
                            break;
                        }
                    }
                    if !(p < variable_top && p > MERGE_SEPARATOR_PRIMARY) {
                        break;
                    }
                }
            }
            // ce could be primary ignorable, or NO_CE, or the merge separator, or a regular
            // primary CE, but it is not variable. If ce == NO_CE, then write nothing for the
            // primary level but terminate compression on all levels and then exit the loop.
            if p > NO_CE_PRIMARY && levels & PRIMARY_LEVEL_FLAG != 0 {
                // Test the un-reordered primary for compressibility.
                let is_compressible = self.special_primaries.is_compressible((p >> 24) as _);
                if let Some(reordering) = &self.reordering {
                    p = reordering.reorder(p);
                }
                let p1 = (p >> 24) as u8;
                if !is_compressible || p1 != (prev_reordered_primary >> 24) as u8 {
                    if prev_reordered_primary != 0 {
                        if p < prev_reordered_primary {
                            // No primary compression terminator at the end of the level or
                            // merged segment.
                            if p1 > MERGE_SEPARATOR_BYTE {
                                sink.write(state, &[PRIMARY_COMPRESSION_LOW_BYTE])?;
                            }
                        } else {
                            sink.write(state, &[PRIMARY_COMPRESSION_HIGH_BYTE])?;
                        }
                    }
                    sink.write_byte(state, p1)?;
                    // Remember the lead byte only for compressible primaries so the next
                    // CE can continue (or terminate) the compression run.
                    prev_reordered_primary = if is_compressible { p } else { 0 };
                }
                // Write the remaining primary bytes, dropping trailing zero bytes.
                let p2 = (p >> 16) as u8;
                if p2 != 0 {
                    let (b0, b1, b2) = (p2, (p >> 8) as _, p as _);
                    sink.write_byte(state, b0)?;
                    if b1 != 0 {
                        sink.write_byte(state, b1)?;
                        if b2 != 0 {
                            sink.write_byte(state, b2)?;
                        }
                    }
                }
            }
            let non_primary = ce.non_primary();
            if non_primary.ignorable() {
                continue; // completely ignorable, no secondary/case/tertiary/quaternary
            }
            // Flushes a pending run of common weights for a level, choosing the
            // low- or high-range compression byte depending on whether the next
            // weight `$w` sorts below or above the common weight (`$lim`).
            macro_rules! handle_common {
                ($key:ident, $w:ident, $common:ident, $weights:ident, $lim:expr) => {
                    if $common != 0 {
                        $common -= 1;
                        while $common >= $weights[WEIGHT_MAX_COUNT] as _ {
                            $key.append_byte($weights[WEIGHT_MIDDLE]);
                            $common -= $weights[WEIGHT_MAX_COUNT] as usize;
                        }
                        let b = if $w < $lim {
                            $weights[WEIGHT_LOW] + ($common as u8)
                        } else {
                            $weights[WEIGHT_HIGH] - ($common as u8)
                        };
                        $key.append_byte(b);
                        $common = 0;
                    }
                };
                ($key:ident, $w:ident, $common:ident, $weights:ident) => {
                    handle_common!($key, $w, $common, $weights, COMMON_WEIGHT16);
                };
            }
            if levels & SECONDARY_LEVEL_FLAG != 0 {
                let s = non_primary.secondary();
                if s == 0 {
                    // secondary ignorable
                } else if s == COMMON_WEIGHT16
                    && (!self.options.backward_second_level() || p != MERGE_SEPARATOR_PRIMARY)
                {
                    // s is a common secondary weight, and backwards-secondary is off or the ce
                    // is not the merge separator.
                    common_secondaries += 1;
                } else if !self.options.backward_second_level() {
                    handle_common!(secondaries, s, common_secondaries, SEC_COMMON);
                    secondaries.append_weight_16(s);
                } else {
                    if common_secondaries != 0 {
                        common_secondaries -= 1;
                        // Append reverse weights. The level will be re-reversed later.
                        let remainder = common_secondaries % SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
                        let b = if prev_secondary < COMMON_WEIGHT16 {
                            SEC_COMMON[WEIGHT_LOW] + remainder as u8
                        } else {
                            SEC_COMMON[WEIGHT_HIGH] - remainder as u8
                        };
                        secondaries.append_byte(b);
                        common_secondaries -= remainder;
                        // common_secondaries is now a multiple of SEC_COMMON[WEIGHT_MAX_COUNT]
                        while common_secondaries > 0 {
                            // same as >= SEC_COMMON[WEIGHT_MAX_COUNT]
                            secondaries.append_byte(SEC_COMMON[WEIGHT_MIDDLE]);
                            common_secondaries -= SEC_COMMON[WEIGHT_MAX_COUNT] as usize;
                        }
                        // commonSecondaries == 0
                    }
                    if 0 < p && p <= MERGE_SEPARATOR_PRIMARY {
                        // The backwards secondary level compares secondary weights backwards
                        // within segments separated by the merge separator (U+FFFE).
                        // Reverse the bytes of the just-finished segment in place.
                        let secs = &mut secondaries.buf;
                        let last = secs.len() - 1;
                        if sec_segment_start < last {
                            let mut q = sec_segment_start;
                            let mut r = last;
                            // these indices start at valid values and we stop when they cross
                            #[expect(clippy::indexing_slicing)]
                            while q < r {
                                let b = secs[q];
                                secs[q] = secs[r];
                                q += 1;
                                secs[r] = b;
                                r -= 1;
                            }
                        }
                        let b = if p == NO_CE_PRIMARY {
                            LEVEL_SEPARATOR_BYTE
                        } else {
                            MERGE_SEPARATOR_BYTE
                        };
                        secondaries.append_byte(b);
                        prev_secondary = 0;
                        sec_segment_start = secondaries.len();
                    } else {
                        secondaries.append_reverse_weight_16(s);
                        prev_secondary = s;
                    }
                }
            }
            if levels & CASE_LEVEL_FLAG != 0 {
                if self.options.strength() == Strength::Primary && p == 0
                    || non_primary.bits() <= 0xffff
                {
                    // Primary+caseLevel: Ignore case level weights of primary ignorables.
                    // Otherwise: Ignore case level weights of secondary ignorables. For
                    // details see the comments in the CollationCompare class.
                } else {
                    // case bits & tertiary lead byte
                    let mut c = ((non_primary.bits() >> 8) & 0xff) as u8;
                    debug_assert_ne!(c & 0xc0, 0xc0);
                    if c & 0xc0 == 0 && c > LEVEL_SEPARATOR_BYTE {
                        common_cases += 1;
                    } else {
                        if !self.options.upper_first() {
                            // lower first: Compress common weights to nibbles 1..7..13,
                            // mixed=14, upper=15. If there are only common (=lowest) weights
                            // in the whole level, then we need not write anything. Level
                            // length differences are handled already on the next-higher level.
                            if common_cases != 0
                                && (c > LEVEL_SEPARATOR_BYTE || !cases.is_empty())
                            {
                                common_cases -= 1;
                                while common_cases >= CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
                                {
                                    cases.append_byte(CASE_LOWER_FIRST_COMMON[WEIGHT_MIDDLE] << 4);
                                    common_cases -=
                                        CASE_LOWER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
                                }
                                let b = if c <= LEVEL_SEPARATOR_BYTE {
                                    CASE_LOWER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8
                                } else {
                                    CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] - common_cases as u8
                                };
                                cases.append_byte(b << 4);
                                common_cases = 0;
                            }
                            if c > LEVEL_SEPARATOR_BYTE {
                                // 14 or 15
                                c = (CASE_LOWER_FIRST_COMMON[WEIGHT_HIGH] + (c >> 6)) << 4;
                            }
                        } else {
                            // upper first: Compress common weights to nibbles 3..15, mixed=2,
                            // upper=1. The compressed common case weights only go up from the
                            // "low" value because with upperFirst the common weight is the
                            // highest one.
                            if common_cases != 0 {
                                common_cases -= 1;
                                while common_cases >= CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as _
                                {
                                    cases.append_byte(CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] << 4);
                                    common_cases -=
                                        CASE_UPPER_FIRST_COMMON[WEIGHT_MAX_COUNT] as usize;
                                }
                                cases.append_byte(
                                    (CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] + common_cases as u8) << 4,
                                );
                                common_cases = 0;
                            }
                            if c > LEVEL_SEPARATOR_BYTE {
                                // 2 or 1
                                c = (CASE_UPPER_FIRST_COMMON[WEIGHT_LOW] - (c >> 6)) << 4;
                            }
                        }
                        // c is a separator byte 01 or a left-shifted nibble 0x10, 0x20, ...
                        // 0xf0.
                        cases.append_byte(c);
                    }
                }
            }
            if levels & TERTIARY_LEVEL_FLAG != 0 {
                let mut t = non_primary.tertiary_case_quarternary(tertiary_mask);
                debug_assert_ne!(non_primary.bits() & 0xc000, 0xc000);
                if t == COMMON_WEIGHT16 {
                    common_tertiaries += 1;
                } else if tertiary_mask & 0x8000 == 0 {
                    // Tertiary weights without case bits. Move lead bytes 06..3F to C6..FF
                    // for a large common-weight range.
                    handle_common!(tertiaries, t, common_tertiaries, TER_ONLY_COMMON);
                    if t > COMMON_WEIGHT16 {
                        t += 0xc000;
                    }
                    tertiaries.append_weight_16(t);
                } else if !self.options.upper_first() {
                    // Tertiary weights with caseFirst=lowerFirst. Move lead bytes 06..BF to
                    // 46..FF for the common-weight range.
                    handle_common!(tertiaries, t, common_tertiaries, TER_LOWER_FIRST_COMMON);
                    if t > COMMON_WEIGHT16 {
                        t += 0x4000;
                    }
                    tertiaries.append_weight_16(t);
                } else {
                    // Tertiary weights with caseFirst=upperFirst. Do not change the
                    // artificial uppercase weight of a tertiary CE (0.0.ut), to keep tertiary
                    // CEs well-formed. Their case+tertiary weights must be greater than those
                    // of primary and secondary CEs.
                    //
                    // Separator 01 -> 01 (unchanged)
                    // Lowercase 02..04 -> 82..84 (includes uncased)
                    // Common weight 05 -> 85..C5 (common-weight compression range)
                    // Lowercase 06..3F -> C6..FF
                    // Mixed case 42..7F -> 42..7F
                    // Uppercase 82..BF -> 02..3F
                    // Tertiary CE 86..BF -> C6..FF
                    if t <= NO_CE_TERTIARY {
                        // Keep separators unchanged.
                    } else if non_primary.bits() > 0xffff {
                        // Invert case bits of primary & secondary CEs.
                        t ^= 0xc000;
                        if t < (TER_UPPER_FIRST_COMMON[WEIGHT_HIGH] as u16) << 8 {
                            t -= 0x4000;
                        }
                    } else {
                        // Keep uppercase bits of tertiary CEs.
                        debug_assert!((0x8600..=0xbfff).contains(&t));
                        t += 0x4000;
                    }
                    handle_common!(
                        tertiaries,
                        t,
                        common_tertiaries,
                        TER_UPPER_FIRST_COMMON,
                        (TER_UPPER_FIRST_COMMON[WEIGHT_LOW] as u16) << 8
                    );
                    tertiaries.append_weight_16(t);
                }
            }
            if levels & QUATERNARY_LEVEL_FLAG != 0 {
                let q = (non_primary.bits() & 0xffff) as u16;
                if q & 0xc0 == 0 && q > NO_CE_QUATERNARY {
                    common_quaternaries += 1;
                } else if q == NO_CE_QUATERNARY
                    && self.options.alternate_handling() == AlternateHandling::NonIgnorable
                    && quaternaries.is_empty()
                {
                    // If alternate=non-ignorable and there are only common quaternary weights,
                    // then we need not write anything. The only weights greater than the
                    // merge separator and less than the common weight are shifted primary
                    // weights, which are not generated for alternate=non-ignorable. There are
                    // also exactly as many quaternary weights as tertiary weights, so level
                    // length differences are handled already on tertiary level. Any
                    // above-common quaternary weight will compare greater regardless.
                    quaternaries.append_byte(LEVEL_SEPARATOR_BYTE);
                } else {
                    let q = if q == NO_CE_QUATERNARY {
                        LEVEL_SEPARATOR_BYTE
                    } else {
                        (0xfc + ((q >> 6) & 3)) as u8
                    };
                    handle_common!(
                        quaternaries,
                        q,
                        common_quaternaries,
                        QUAT_COMMON,
                        QUAT_COMMON[WEIGHT_LOW]
                    );
                    quaternaries.append_byte(q);
                }
            }
            if (non_primary.bits() >> 24) as u8 == LEVEL_SEPARATOR_BYTE {
                break; // ce == NO_CE
            }
        }
        // Flush the buffered levels to the sink, each preceded by a separator.
        macro_rules! write_level {
            ($key:ident, $flag:ident) => {
                if levels & $flag != 0 {
                    sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
                    sink.write(state, &$key.buf)?;
                }
            };
        }
        write_level!(secondaries, SECONDARY_LEVEL_FLAG);
        if levels & CASE_LEVEL_FLAG != 0 {
            sink.write(state, &[LEVEL_SEPARATOR_BYTE])?;
            // Write pairs of nibbles as bytes, except separator bytes as themselves.
            let mut b = 0;
            if let Some((last, head)) = cases.buf.split_last() {
                debug_assert_eq!(*last, 1); // The trailing NO_CE
                for c in head {
                    debug_assert_eq!(*c & 0xf, 0);
                    debug_assert_ne!(*c, 0);
                    if b == 0 {
                        b = *c;
                    } else {
                        sink.write_byte(state, b | (*c >> 4))?;
                        b = 0;
                    }
                }
            } else {
                debug_assert!(false);
            }
            if b != 0 {
                sink.write_byte(state, b)?;
            }
        }
        write_level!(tertiaries, TERTIARY_LEVEL_FLAG);
        write_level!(quaternaries, QUATERNARY_LEVEL_FLAG);
        Ok(())
    }
}

/// Error indicating that a [`CollationKeySink`] with limited space ran out of space.
#[derive(Debug, PartialEq, Eq)]
pub struct TooSmall {
    /// The total length, in bytes, of the sort key.
    pub length: usize,
}

impl TooSmall {
    /// Creates a [`TooSmall`] error carrying the total sort key length.
    pub fn new(length: usize) -> Self {
        Self { length }
    }
}

/// A [`std::io::Write`]-like trait for writing to a buffer-like object.
///
/// (This crate does not have access to [`std`].)
///
///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, /// including in SemVer minor releases. Do not implement or call methods on this trait /// unless you are prepared for things to occasionally break. /// /// Graduation tracking issue: [issue #7178](https://github.com/unicode-org/icu4x/issues/7178). ///
/// /// ✨ *Enabled with the `unstable` Cargo feature.* pub trait CollationKeySink { /// The type of error the sink may return. type Error; /// An intermediate state object used by the sink, which must implement [`Default`]. type State; /// A result value indicating the final state of the sink (e.g. a number of bytes written). type Output; /// Writes a buffer into the writer. fn write(&mut self, state: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error>; /// Write a single byte into the writer. fn write_byte(&mut self, state: &mut Self::State, b: u8) -> Result<(), Self::Error> { self.write(state, &[b]) } /// Finalize any internal sink state (perhaps by flushing a buffer) and return the final /// output value. fn finish(&mut self, state: Self::State) -> Result; } impl CollationKeySink for Vec { type Error = Infallible; type State = (); type Output = (); fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> { self.extend_from_slice(buf); Ok(()) } fn finish(&mut self, _: Self::State) -> Result { Ok(()) } } impl CollationKeySink for VecDeque { type Error = Infallible; type State = (); type Output = (); fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> { self.extend(buf.iter()); Ok(()) } fn finish(&mut self, _: Self::State) -> Result { Ok(()) } } impl CollationKeySink for SmallVec<[u8; N]> { type Error = Infallible; type State = (); type Output = (); fn write(&mut self, _: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> { self.extend_from_slice(buf); Ok(()) } fn finish(&mut self, _: Self::State) -> Result { Ok(()) } } impl CollationKeySink for [u8] { type Error = TooSmall; type State = usize; type Output = usize; fn write(&mut self, offset: &mut Self::State, buf: &[u8]) -> Result<(), Self::Error> { if *offset + buf.len() <= self.len() { // just checked bounds #[expect(clippy::indexing_slicing)] self[*offset..*offset + buf.len()].copy_from_slice(buf); } *offset += buf.len(); Ok(()) } fn finish(&mut 
self, offset: Self::State) -> Result { if offset <= self.len() { Ok(offset) } else { Err(TooSmall::new(offset)) } } } #[derive(Default)] struct SortKeyLevel { buf: SmallVec<[u8; 40]>, } impl SortKeyLevel { fn len(&self) -> usize { self.buf.len() } fn is_empty(&self) -> bool { self.buf.is_empty() } fn append_byte(&mut self, x: u8) { self.buf.push(x); } fn append_weight_16(&mut self, w: u16) { debug_assert_ne!(w, 0); let b0 = (w >> 8) as u8; let b1 = w as u8; self.append_byte(b0); if b1 != 0 { self.append_byte(b1); } } fn append_reverse_weight_16(&mut self, w: u16) { debug_assert_ne!(w, 0); let b0 = (w >> 8) as u8; let b1 = w as u8; if b1 != 0 { self.append_byte(b1); } self.append_byte(b0); } fn append_weight_32(&mut self, w: u32) { debug_assert_ne!(w, 0); let b0 = (w >> 24) as u8; let b1 = (w >> 16) as u8; let b2 = (w >> 8) as u8; let b3 = w as u8; self.append_byte(b0); if b1 != 0 { self.append_byte(b1); if b2 != 0 { self.append_byte(b2); if b3 != 0 { self.append_byte(b3); } } } } } // The algorithm below (BOCSU or Binary Ordered Compression Scheme for Unicode) is translated // from the C++ code in ICU4C at icu4c/source/i18n/bocsu.{cpp,h}. The algorithm works by // converting a sequence of codepoints into a sequence of presumably small differences. See // the C++ code for a more detailed explanation. macro_rules! 
// Euclidean-style division for negative dividends: after expansion,
// `$n` holds the floored quotient and `$m` the non-negative remainder.
negdivmod {
    ($n:ident, $d:ident, $m:ident) => {
        $m = $n % $d;
        $n /= $d;
        if $m < 0 {
            $n -= 1;
            $m += $d;
        }
    };
}

/// Encodes one BOCSU code point difference as one to four bytes whose
/// lexicographic order matches the numeric order of `diff`.
fn write_diff(mut diff: i32, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
    S: CollationKeySink + ?Sized,
{
    let mut out = |b| sink.write_byte(state, b);
    if diff >= SLOPE_REACH_NEG_1 {
        // Zero or positive differences: pick the shortest encoding whose
        // reach covers `diff`.
        if diff <= SLOPE_REACH_POS_1 {
            out((SLOPE_MIDDLE + diff) as _)?;
        } else if diff <= SLOPE_REACH_POS_2 {
            out((SLOPE_START_POS_2 + (diff / SLOPE_TAIL_COUNT)) as _)?;
            out((SLOPE_MIN + diff % SLOPE_TAIL_COUNT) as _)?;
        } else if diff <= SLOPE_REACH_POS_3 {
            let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            let p0 = SLOPE_START_POS_3 + (diff / SLOPE_TAIL_COUNT);
            out(p0 as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
        } else {
            // Four-byte encoding with the maximal lead byte.
            let p3 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p2 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            diff /= SLOPE_TAIL_COUNT;
            let p1 = SLOPE_MIN + diff % SLOPE_TAIL_COUNT;
            out(SLOPE_MAX as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
            out(p3 as _)?;
        }
    } else {
        // Negative differences use floored division so the trail bytes stay
        // in the non-negative tail range.
        let mut m;
        if diff >= SLOPE_REACH_NEG_2 {
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            out((SLOPE_START_NEG_2 + diff) as _)?;
            out((SLOPE_MIN + m) as _)?;
        } else if diff >= SLOPE_REACH_NEG_3 {
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p2 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p1 = SLOPE_MIN + m;
            let p0 = SLOPE_START_NEG_3 + diff;
            out(p0 as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
        } else {
            // Four-byte encoding with the minimal lead byte.
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p3 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p2 = SLOPE_MIN + m;
            negdivmod!(diff, SLOPE_TAIL_COUNT, m);
            let p1 = SLOPE_MIN + m;
            let _ = diff;
            out(SLOPE_MIN as _)?;
            out(p1 as _)?;
            out(p2 as _)?;
            out(p3 as _)?;
        }
    }
    Ok(())
}

/// Writes the identical level: the BOCSU encoding of the code point sequence,
/// with each code point encoded as a difference from an adjusted previous one,
/// and U+FFFE written as the raw merge separator byte.
fn write_identical_level(iter: I, sink: &mut S, state: &mut S::State) -> Result<(), S::Error>
where
    I: Iterator,
    S: CollationKeySink + ?Sized,
{
    let mut prev = 0i32;
    for c in iter {
        if !(0x4e00..=0xa000).contains(&prev) {
            // NOTE(review): this range is inclusive of 0xa000, whereas ICU4C's
            // bocsu.cpp uses `prev < 0x4e00 || prev >= 0xa000` — confirm the
            // boundary is intended.
            prev = (prev & !0x7f) - SLOPE_REACH_NEG_1;
        } else {
            // Unihan U+4e00..U+9fa5: double-bytes down from the upper end
            prev = 0x9fff - SLOPE_REACH_POS_2;
        }
        if c == MERGE_SEPARATOR {
            sink.write_byte(state, MERGE_SEPARATOR_BYTE)?;
            prev = 0;
        } else {
            let c = c as i32;
            write_diff(c - prev, sink, state)?;
            prev = c;
        }
    }
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;
    use icu_locale::locale;

    // NOTE(review): the type parameter appears garbled here (likely `Vec<u8>`)
    // — confirm against upstream.
    type Key = Vec;

    fn collator_en(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("en").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        Collator::try_new(locale, options).unwrap()
    }

    fn collator_en_case_level(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("en").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        options.case_level = Some(crate::options::CaseLevel::On);
        Collator::try_new(locale, options).unwrap()
    }

    // Keys for "aabc" / "aAbc" / "áAbc": the pairs differ only in case and
    // only in diacritics respectively.
    fn keys(strength: Strength) -> (Key, Key, Key) {
        let collator = collator_en(strength);
        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aabc", &mut k0);
        let mut k1 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aAbc", &mut k1);
        let mut k2 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k2);
        (k0, k1, k2)
    }

    #[test]
    fn sort_key_primary() {
        let (k0, k1, k2) = keys(Strength::Primary);
        assert_eq!(k0, k1);
        assert_eq!(k1, k2);
    }

    #[test]
    fn sort_key_secondary() {
        let (k0, k1, k2) = keys(Strength::Secondary);
        assert_eq!(k0, k1);
        assert!(k1 < k2);
    }

    #[test]
    fn sort_key_tertiary() {
        let (k0, k1, k2) = keys(Strength::Tertiary);
        assert!(k0 < k1);
        assert!(k1 < k2);
    }

    fn collator_ja(strength: Strength) -> CollatorBorrowed<'static> {
        let locale = locale!("ja").into();
        let mut options = CollatorOptions::default();
        options.strength = Some(strength);
        Collator::try_new(locale, options).unwrap()
    }

    fn keys_ja_strs(strength: Strength, s0: &str, s1: &str) -> (Key, Key) {
        let collator = collator_ja(strength);
        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to(s0, &mut k0);
        let mut k1 = Vec::new();
        let Ok(()) = collator.write_sort_key_to(s1, &mut k1);
        (k0, k1)
    }

    // Hiragana vs. katakana: distinguished only at quaternary level in ja.
    fn keys_ja(strength: Strength) -> (Key, Key) {
        keys_ja_strs(strength, "あ", "ア")
    }

    #[test]
    fn sort_keys_ja_to_quaternary() {
        let (k0, k1) = keys_ja(Strength::Primary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Secondary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Tertiary);
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja(Strength::Quaternary);
        assert!(k0 < k1);
    }

    #[test]
    fn sort_keys_ja_identical() {
        let (k0, k1) = keys_ja_strs(Strength::Quaternary, "ア", "ア");
        assert_eq!(k0, k1);
        let (k0, k1) = keys_ja_strs(Strength::Identical, "ア", "ア");
        assert!(k0 < k1);
    }

    #[test]
    fn sort_keys_utf16() {
        let collator = collator_en(Strength::Identical);
        const STR8: &[u8] = b"hello world!";
        let mut k8 = Vec::new();
        let Ok(()) = collator.write_sort_key_utf8_to(STR8, &mut k8);
        const STR16: &[u16] = &[
            0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
        ];
        let mut k16 = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(STR16, &mut k16);
        assert_eq!(k8, k16);
    }

    #[test]
    fn sort_keys_invalid() {
        let collator = collator_en(Strength::Identical);
        // some invalid strings
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_utf8_to(b"\xf0\x90", &mut k);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(&[0xdd1e], &mut k);
    }

    #[test]
    fn sort_key_to_vecdeque() {
        let collator = collator_en(Strength::Identical);
        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
        let mut k1 = VecDeque::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k1);
        assert!(k0.iter().eq(k1.iter()));
    }

    #[test]
    fn sort_key_to_slice() {
        let collator = collator_en(Strength::Identical);
        let mut k0 = Vec::new();
        let Ok(()) = collator.write_sort_key_to("áAbc", &mut k0);
        let mut k1 = [0u8; 100];
        let len = collator.write_sort_key_to("áAbc", &mut k1[..]).unwrap();
        assert_eq!(len, k0.len());
        assert!(k0.iter().eq(k1[..len].iter()));
    }

    #[test]
    fn sort_key_to_slice_no_space() {
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 0];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_to_slice_too_long() {
        // This runs out of space in write_sort_key_up_to_quaternary.
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 5];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_to_slice_identical_too_long() {
        // This runs out of space while appending UTF-8 in the SinkAdapter.
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 22];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_just_right() {
        // get the length needed
        let collator = collator_en(Strength::Identical);
        let mut k = [0u8; 0];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        let len = res.unwrap_err().length;
        // almost enough
        let mut k = vec![0u8; len - 1];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        let len = res.unwrap_err().length;
        // just right
        let mut k = vec![0u8; len];
        let res = collator.write_sort_key_to("áAbc", &mut k[..]);
        assert_eq!(res, Ok(len));
    }

    #[test]
    fn sort_key_utf16_slice_too_small() {
        let collator = collator_en(Strength::Identical);
        const STR16: &[u16] = &[0x68, 0x65, 0x6c, 0x6c, 0x6f];
        let mut k = [0u8; 4];
        let res = collator.write_sort_key_utf16_to(STR16, &mut k[..]);
        assert!(matches!(res, Err(TooSmall { .. })));
    }

    #[test]
    fn sort_key_very_long() {
        // Exercises compression of long common-weight runs on the secondary level.
        let collator = collator_en(Strength::Secondary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to(&"a".repeat(300), &mut k);
    }

    #[test]
    fn sort_key_case_level() {
        let collator = collator_en_case_level(Strength::Tertiary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to("aBc", &mut k);
    }

    #[test]
    fn sort_key_case_level_empty() {
        let collator = collator_en_case_level(Strength::Tertiary);
        let mut k = Vec::new();
        let Ok(()) = collator.write_sort_key_to("", &mut k);
    }

    fn check_sort_key_less(a: &[u16], b: &[u16]) {
        let collator = collator_en(Strength::Identical);
        let mut ak = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(a, &mut ak);
        let mut bk = Vec::new();
        let Ok(()) = collator.write_sort_key_utf16_to(b, &mut bk);
        assert!(ak < bk, "failed: {a:04x?} - {b:04x?}");
    }

    #[test]
    fn sort_key_fffe_bug_6811() {
        check_sort_key_less(
            &[0xfffe, 0x0001, 0x0002, 0x0003],
            &[0x0001, 0xfffe, 0x0002, 0x0003],
        );
        check_sort_key_less(
            &[0x0001, 0xfffe, 0x0002, 0x0003],
            &[0x0001, 0x0002, 0xfffe, 0x0003],
        );
        check_sort_key_less(
            &[0x0001, 0x0002, 0xfffe, 0x0003],
            &[0x0001, 0x0002, 0x0003, 0xfffe],
        );
        check_sort_key_less(&[0xfffe, 0x0000, 0x0000], &[0x0000, 0xfffe, 0x0000]);
        check_sort_key_less(&[0x0000, 0xfffe, 0x0000], &[0x0000, 0x0000, 0xfffe]);
    }
}