// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). //! This is the main module pertaining to casemapping exceptions. //! //! A single exception is represented by the [`Exception`] type and its ULE equivalent. //! //! The storage format is complicated (and documented on [`Exception`]), but the data format is //! represented equally by [`DecodedException`], which is more human-readable. use icu_provider::prelude::*; use super::data::MappingKind; use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence}; use crate::set::ClosureSink; use alloc::borrow::Cow; use core::fmt; #[cfg(any(feature = "serde", feature = "datagen"))] use core::ops::Range; use core::ptr; use zerovec::ule::AsULE; use zerovec::VarZeroVec; const SURROGATES_START: u32 = 0xD800; const SURROGATES_LEN: u32 = 0xDFFF - SURROGATES_START + 1; /// This represents case mapping exceptions that can't be represented as a delta applied to /// the original code point. The codepoint /// trie in CaseMapper stores indices into this VarZeroVec. /// ///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, /// including in SemVer minor releases. While the serde representation of data structs is guaranteed /// to be stable, their Rust representation might not be. Use with caution. ///
#[cfg_attr(feature = "serde", derive(serde::Deserialize))] #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] #[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))] #[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] pub struct CaseMapExceptions<'data> { #[cfg_attr(feature = "serde", serde(borrow))] /// The list of exceptions pub exceptions: VarZeroVec<'data, ExceptionULE>, } impl CaseMapExceptions<'_> { /// Obtain the exception at index `idx`. Will /// return a default value if not present (GIGO behavior), /// as these indices should come from a paired CaseMapData object /// /// Will also panic in debug mode pub fn get(&self, idx: u16) -> &ExceptionULE { let exception = self.exceptions.get(idx.into()); debug_assert!(exception.is_some()); exception.unwrap_or(ExceptionULE::empty_exception()) } #[cfg(any(feature = "serde", feature = "datagen"))] pub(crate) fn validate(&self) -> Result, &'static str> { for exception in self.exceptions.iter() { exception.validate()?; } u16::try_from(self.exceptions.len()) .map_err(|_| "Too many exceptions") .map(|l| 0..l) } } /// A type representing the wire format of `Exception`. The data contained is /// equivalently represented by [`DecodedException`]. /// /// This type is itself not used that much, most of its relevant methods live /// on [`ExceptionULE`]. /// /// The `bits` contain supplementary data, whereas /// `slot_presence` marks te presence of various extra data /// in the `data` field. /// /// The `data` field is not validated to contain all of this data, /// this type will have GIGO behavior when constructed with invalid `data`. /// /// The format of `data` is documented on the field /// ///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, /// including in SemVer minor releases. While the serde representation of data structs is guaranteed /// to be stable, their Rust representation might not be. Use with caution. ///
#[zerovec::make_varule(ExceptionULE)] #[derive(PartialEq, Eq, Clone, Default, Debug)] #[zerovec::skip_derive(Ord)] #[cfg_attr( feature = "serde", derive(serde::Deserialize), zerovec::derive(Deserialize) )] #[cfg_attr( feature = "datagen", derive(serde::Serialize), zerovec::derive(Serialize) )] pub struct Exception<'a> { /// The various bit based exception data associated with this. /// /// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more. pub bits: ExceptionBits, /// Which slots are present in `data`. /// /// Format: a u8 of bitflags pub slot_presence: SlotPresence, /// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]` /// /// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping), /// this will have one entry in the string, packed together. /// /// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s /// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars /// starting from 0xE000. The sign is stored in bits.negative_delta. /// /// If both Closure/FullMapping are present, the next char will be the length of the closure slot, /// bisecting the rest of the data. /// If only one is present, the rest of the data represents that slot. /// /// The closure slot simply represents one string. The full-mappings slot represents four strings, /// packed in a way similar to VarZeroVec, in the following format: /// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]` /// /// where `i1 i2 i3` are the indices of the relevant mappings string. The strings are stored in /// the order corresponding to the MappingKind enum. pub data: Cow<'a, str>, } impl ExceptionULE { #[inline] fn empty_exception() -> &'static Self { static EMPTY_BYTES: &[u8] = &[0, 0]; // Safety: // ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s // // An "empty" one can be constructed from a slice of two u8s unsafe { let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0); &*(slice as *const Self) } } pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool { self.slot_presence.has_slot(slot) } /// Obtain a `char` slot, if occupied. If `slot` represents a string slot, /// will return `None` pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option { if slot >= ExceptionSlot::STRING_SLOTS_START { return None; } let bit = 1 << (slot as u8); // check if slot is occupied if self.slot_presence.0 & bit == 0 { return None; } let previous_slot_mask = bit - 1; let previous_slots = self.slot_presence.0 & previous_slot_mask; let slot_num = previous_slots.count_ones() as usize; self.data.chars().nth(slot_num) } /// Get the `simple_case` delta (i.e. the `delta` slot), given the character /// this data belongs to. /// /// Normalizes the delta from char-format to u32 format /// /// Does *not* handle the sign of the delta; see self.bits.negative_delta fn get_simple_case_delta(&self) -> Option { let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?; let mut delta = u32::from(delta_ch); // We "fill in" the surrogates range by offsetting deltas greater than it if delta >= SURROGATES_START { delta -= SURROGATES_LEN; } Some(delta) } /// Get the `simple_case` value (i.e. the `delta` slot), given the character /// this data belongs to. /// /// The data is stored as a delta so the character must be provided. /// /// The data cannot be stored directly as a character because the trie is more /// compact with adjacent characters sharing deltas. pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option { let delta = self.get_simple_case_delta()?; let mut delta = i32::try_from(delta).ok()?; if self.bits.negative_delta() { delta = -delta; } let new_ch = i32::try_from(u32::from(ch)).ok()? + delta; char::try_from(u32::try_from(new_ch).ok()?).ok() } /// Returns *all* the data in the closure/full slots, including length metadata fn get_stringy_data(&self) -> Option<&str> { const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1; let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize; let mut chars = self.data.chars(); for _ in 0..char_slot_count { let res = chars.next(); res?; } Some(chars.as_str()) } /// Returns a single stringy slot, either ExceptionSlot::Closure /// or ExceptionSlot::FullMappings. fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> { debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings); let other_slot = if slot == ExceptionSlot::Closure { ExceptionSlot::FullMappings } else { ExceptionSlot::Closure }; if !self.slot_presence.has_slot(slot) { return None; } let stringy_data = self.get_stringy_data()?; if self.slot_presence.has_slot(other_slot) { // both stringy slots are used, we need a length let mut chars = stringy_data.chars(); // GIGO: to have two strings there must be a length, if not present return None let length_char = chars.next()?; let length = usize::try_from(u32::from(length_char)).unwrap_or(0); // The length indexes into the string after the first char let remaining_slice = chars.as_str(); // GIGO: will return none if there wasn't enough space in this slot if slot == ExceptionSlot::Closure { remaining_slice.get(0..length) } else { remaining_slice.get(length..) } } else { // only a single stringy slot, there is no length stored Some(stringy_data) } } /// Get the data behind the `closure` slot pub(crate) fn get_closure_slot(&self) -> Option<&str> { self.get_stringy_slot(ExceptionSlot::Closure) } /// Get all the slot data for the FullMappings slot /// /// This needs to be further segmented into four based on length metadata fn get_fullmappings_slot_data(&self) -> Option<&str> { self.get_stringy_slot(ExceptionSlot::FullMappings) } /// Get a specific FullMappings slot value pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> { let data = self.get_fullmappings_slot_data()?; let mut chars = data.chars(); // GIGO: must have three index strings, else return None let i1 = usize::try_from(u32::from(chars.next()?)).ok()?; let i2 = usize::try_from(u32::from(chars.next()?)).ok()?; let i3 = usize::try_from(u32::from(chars.next()?)).ok()?; let remaining_slice = chars.as_str(); // GIGO: if the indices are wrong, return None match kind { MappingKind::Lower => remaining_slice.get(..i1), MappingKind::Fold => remaining_slice.get(i1..i2), MappingKind::Upper => remaining_slice.get(i2..i3), MappingKind::Title => remaining_slice.get(i3..), } } // convenience function that lets us use the ? operator fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> { Some([ self.get_fullmappings_slot_for_kind(MappingKind::Lower)? .into(), self.get_fullmappings_slot_for_kind(MappingKind::Fold)? .into(), self.get_fullmappings_slot_for_kind(MappingKind::Upper)? .into(), self.get_fullmappings_slot_for_kind(MappingKind::Title)? .into(), ]) } // Given a mapping kind, returns the character for that kind, if it exists. Fold falls // back to Lower; Title falls back to Upper. #[inline] pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option { match kind { MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()), MappingKind::Fold => self .get_char_slot(ExceptionSlot::Fold) .or_else(|| self.get_char_slot(ExceptionSlot::Lower)), MappingKind::Title => self .get_char_slot(ExceptionSlot::Title) .or_else(|| self.get_char_slot(ExceptionSlot::Upper)), } } pub(crate) fn add_full_and_closure_mappings(&self, set: &mut S) { if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) { if !full.is_empty() { set.add_string(full); } }; if let Some(closure) = self.get_closure_slot() { for c in closure.chars() { set.add_char(c); } }; } /// Extract all the data out into a structured form /// /// Useful for serialization and debugging pub fn decode(&self) -> DecodedException<'_> { // Potential future optimization: This can // directly access each bit one after the other and iterate the string // which avoids recomputing slot offsets over and over again. // // If we're doing so we may wish to retain this older impl so that we can still roundtrip test let bits = self.bits; let lowercase = self.get_char_slot(ExceptionSlot::Lower); let casefold = self.get_char_slot(ExceptionSlot::Fold); let uppercase = self.get_char_slot(ExceptionSlot::Upper); let titlecase = self.get_char_slot(ExceptionSlot::Title); let simple_case_delta = self.get_simple_case_delta(); let closure = self.get_closure_slot().map(Into::into); let full = self.get_all_fullmapping_slots(); DecodedException { bits: ExceptionBits::from_unaligned(bits), lowercase, casefold, uppercase, titlecase, simple_case_delta, closure, full, } } #[cfg(any(feature = "serde", feature = "datagen"))] pub(crate) fn validate(&self) -> Result<(), &'static str> { // check that ICU4C specific fields are not set // check that there is enough space for all the offsets if self.bits.double_width_slots() { return Err("double-width-slots should not be used in ICU4C"); } // just run all of the slot getters at once and then check let decoded = self.decode(); for (slot, decoded_slot) in [ (ExceptionSlot::Lower, &decoded.lowercase), (ExceptionSlot::Fold, &decoded.casefold), (ExceptionSlot::Upper, &decoded.uppercase), (ExceptionSlot::Title, &decoded.titlecase), ] { if self.has_slot(slot) && decoded_slot.is_none() { // decoding hit GIGO behavior, oops! return Err("Slot decoding failed"); } } if self.has_slot(ExceptionSlot::Delta) && decoded.simple_case_delta.is_none() { // decoding hit GIGO behavior, oops! return Err("Slot decoding failed"); } if self.has_slot(ExceptionSlot::Closure) && decoded.closure.is_none() { return Err("Slot decoding failed"); } if self.has_slot(ExceptionSlot::FullMappings) { if decoded.full.is_some() { let data = self .get_fullmappings_slot_data() .ok_or("fullmappings slot doesn't parse")?; let mut chars = data.chars(); let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?); let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?); let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?); if i2 < i1 || i3 < i2 { return Err("fullmappings string contains non-sequential indices"); } let rest = chars.as_str(); let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?; if i1 > len || i2 > len || i3 > len { return Err("fullmappings string contains out-of-bounds indices"); } } else { return Err("Slot decoding failed"); } } Ok(()) } } impl fmt::Debug for ExceptionULE { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.decode().fmt(f) } } /// A decoded [`Exception`] type, with all of the data parsed out into /// separate fields. /// ///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, /// including in SemVer minor releases. While the serde representation of data structs is guaranteed /// to be stable, their Rust representation might not be. Use with caution. ///
#[cfg_attr(feature = "serde", derive(serde::Deserialize))] #[cfg_attr(feature = "datagen", derive(serde::Serialize))] #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct DecodedException<'a> { /// The various bit-based data associated with this exception pub bits: ExceptionBits, /// Lowercase mapping pub lowercase: Option, /// Case folding pub casefold: Option, /// Uppercase mapping pub uppercase: Option, /// Titlecase mapping pub titlecase: Option, /// The simple casefold delta. Its sign is stored in bits.negative_delta pub simple_case_delta: Option, /// Closure mappings pub closure: Option>, /// The four full-mappings strings, indexed by MappingKind u8 value pub full: Option<[Cow<'a, str>; 4]>, } impl DecodedException<'_> { /// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`] pub fn encode(&self) -> Exception<'static> { let bits = self.bits; let mut slot_presence = SlotPresence(0); let mut data = alloc::string::String::new(); if let Some(lowercase) = self.lowercase { slot_presence.add_slot(ExceptionSlot::Lower); data.push(lowercase) } if let Some(casefold) = self.casefold { slot_presence.add_slot(ExceptionSlot::Fold); data.push(casefold) } if let Some(uppercase) = self.uppercase { slot_presence.add_slot(ExceptionSlot::Upper); data.push(uppercase) } if let Some(titlecase) = self.titlecase { slot_presence.add_slot(ExceptionSlot::Title); data.push(titlecase) } if let Some(mut simple_case_delta) = self.simple_case_delta { slot_presence.add_slot(ExceptionSlot::Delta); if simple_case_delta >= SURROGATES_START { simple_case_delta += SURROGATES_LEN; } let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0'); data.push(simple_case_delta) } if let Some(ref closure) = self.closure { slot_presence.add_slot(ExceptionSlot::Closure); if self.full.is_some() { // GIGO: if the closure length is more than 0xD800 this will error. Plenty of space. debug_assert!( closure.len() < 0xD800, "Found overlarge closure value when encoding exception" ); let len_char = u32::try_from(closure.len()) .ok() .and_then(|c| char::try_from(c).ok()) .unwrap_or('\0'); data.push(len_char); } data.push_str(closure); } if let Some(ref full) = self.full { slot_presence.add_slot(ExceptionSlot::FullMappings); let mut idx = 0; // iterate all elements except the last, whose length we can calculate from context for mapping in full.iter().take(3) { idx += mapping.len(); data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0')); } for mapping in full { data.push_str(mapping); } } Exception { bits, slot_presence, data: data.into(), } } // Potential optimization: Write an `EncodeAsVarULE` that // directly produces an ExceptionULE } #[cfg(test)] mod tests { use super::*; fn test_roundtrip_once(exception: DecodedException) { let encoded = exception.encode(); let encoded = zerovec::ule::encode_varule_to_box(&encoded); let decoded = encoded.decode(); assert_eq!(decoded, exception); } #[test] fn test_roundtrip() { test_roundtrip_once(DecodedException { lowercase: Some('ø'), ..Default::default() }); test_roundtrip_once(DecodedException { titlecase: Some('X'), lowercase: Some('ø'), ..Default::default() }); test_roundtrip_once(DecodedException { titlecase: Some('X'), ..Default::default() }); test_roundtrip_once(DecodedException { titlecase: Some('X'), simple_case_delta: Some(0xE999), closure: Some("hello world".into()), ..Default::default() }); test_roundtrip_once(DecodedException { simple_case_delta: Some(10), closure: Some("hello world".into()), full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]), ..Default::default() }); test_roundtrip_once(DecodedException { closure: Some("hello world".into()), full: Some(["aa".into(), "ț".into(), "".into(), "å".into()]), ..Default::default() }); test_roundtrip_once(DecodedException { full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]), ..Default::default() }); } }