// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::grapheme::*; use crate::indices::Utf16Indices; use crate::provider::*; use crate::scaffold::{Utf16, Utf8}; use core::str::CharIndices; use icu_collections::char16trie::{Char16Trie, TrieResult}; /// A trait for dictionary based iterator trait DictionaryType { /// The iterator over characters. type IterAttr<'s>: Iterator + Clone; /// The character type. type CharType: Copy + Into; fn to_char(c: Self::CharType) -> char; fn char_len(c: Self::CharType) -> usize; } struct DictionaryBreakIterator< 'l, 's, Y: DictionaryType + ?Sized, X: Iterator + ?Sized, > { trie: Char16Trie<'l>, iter: Y::IterAttr<'s>, len: usize, grapheme_iter: X, // TODO transform value for byte trie } /// Implement the [`Iterator`] trait over the segmenter break opportunities of the given string. /// Please see the [module-level documentation](crate) for its usages. /// /// Lifetimes: /// - `'l` = lifetime of the segmenter object from which this iterator was created /// - `'s` = lifetime of the string being segmented /// /// [`Iterator`]: core::iter::Iterator impl + ?Sized> Iterator for DictionaryBreakIterator<'_, '_, Y, X> { type Item = usize; fn next(&mut self) -> Option { let mut trie_iter = self.trie.iter(); let mut intermediate_length = 0; let mut not_match = false; let mut previous_match = None; let mut last_grapheme_offset = 0; while let Some(next) = self.iter.next() { let ch = Y::to_char(next.1); match trie_iter.next(ch) { TrieResult::FinalValue(_) => { return Some(next.0 + Y::char_len(next.1)); } TrieResult::Intermediate(_) => { // Dictionary has to match with grapheme cluster segment. // If not, we ignore it. while last_grapheme_offset < next.0 + Y::char_len(next.1) { if let Some(offset) = self.grapheme_iter.next() { last_grapheme_offset = offset; continue; } last_grapheme_offset = self.len; break; } if last_grapheme_offset != next.0 + Y::char_len(next.1) { continue; } intermediate_length = next.0 + Y::char_len(next.1); previous_match = Some(self.iter.clone()); } TrieResult::NoMatch => { if intermediate_length > 0 { if let Some(previous_match) = previous_match { // Rewind previous match point self.iter = previous_match; } return Some(intermediate_length); } // Not found return Some(next.0 + Y::char_len(next.1)); } TrieResult::NoValue => { // Prefix string is matched not_match = true; } } } if intermediate_length > 0 { Some(intermediate_length) } else if not_match { // no match by scanning text Some(self.len) } else { None } } } impl DictionaryType for u32 { type IterAttr<'s> = Utf16Indices<'s>; type CharType = u32; fn to_char(c: u32) -> char { char::from_u32(c).unwrap_or(char::REPLACEMENT_CHARACTER) } fn char_len(c: u32) -> usize { if c >= 0x10000 { 2 } else { 1 } } } impl DictionaryType for char { type IterAttr<'s> = CharIndices<'s>; type CharType = char; fn to_char(c: char) -> char { c } fn char_len(c: char) -> usize { c.len_utf8() } } pub(super) struct DictionarySegmenter<'l> { dict: &'l UCharDictionaryBreakData<'l>, grapheme: GraphemeClusterSegmenterBorrowed<'l>, } impl<'l> DictionarySegmenter<'l> { pub(super) fn new( dict: &'l UCharDictionaryBreakData<'l>, grapheme: GraphemeClusterSegmenterBorrowed<'l>, ) -> Self { // TODO: no way to verify trie data Self { dict, grapheme } } /// Create a dictionary based break iterator for an `str` (a UTF-8 string). pub(super) fn segment_str(&'l self, input: &'l str) -> impl Iterator + 'l { let grapheme_iter = self.grapheme.segment_str(input); DictionaryBreakIterator::> { trie: Char16Trie::new(self.dict.trie_data.clone()), iter: input.char_indices(), len: input.len(), grapheme_iter, } } /// Create a dictionary based break iterator for a UTF-16 string. pub(super) fn segment_utf16(&'l self, input: &'l [u16]) -> impl Iterator + 'l { let grapheme_iter = self.grapheme.segment_utf16(input); DictionaryBreakIterator::> { trie: Char16Trie::new(self.dict.trie_data.clone()), iter: Utf16Indices::new(input), len: input.len(), grapheme_iter, } } } #[cfg(test)] #[cfg(feature = "serde")] mod tests { use super::*; use crate::{GraphemeClusterSegmenter, LineSegmenter, WordSegmenter}; use icu_provider::prelude::*; #[test] fn burmese_dictionary_test() { let segmenter = LineSegmenter::new_dictionary(Default::default()); // From css/css-text/word-break/word-break-normal-my-000.html let s = "မြန်မာစာမြန်မာစာမြန်မာစာ"; let result: Vec = segmenter.segment_str(s).collect(); assert_eq!(result, vec![0, 18, 24, 42, 48, 66, 72]); let s_utf16: Vec = s.encode_utf16().collect(); let result: Vec = segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![0, 6, 8, 14, 16, 22, 24]); } #[test] fn cj_dictionary_test() { let response: DataResponse = crate::provider::Baked .load(DataRequest { id: DataIdentifierBorrowed::for_marker_attributes( DataMarkerAttributes::from_str_or_panic("cjdict"), ), ..Default::default() }) .unwrap(); let word_segmenter = WordSegmenter::new_dictionary(Default::default()); let dict_segmenter = DictionarySegmenter::new(response.payload.get(), GraphemeClusterSegmenter::new()); // Match case let s = "龟山岛龟山岛"; let result: Vec = dict_segmenter.segment_str(s).collect(); assert_eq!(result, vec![9, 18]); let result: Vec = word_segmenter.segment_str(s).collect(); assert_eq!(result, vec![0, 9, 18]); let s_utf16: Vec = s.encode_utf16().collect(); let result: Vec = dict_segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![3, 6]); let result: Vec = word_segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![0, 3, 6]); // Match case, then no match case let s = "エディターエディ"; let result: Vec = dict_segmenter.segment_str(s).collect(); assert_eq!(result, vec![15, 24]); // TODO(#3236): Why is WordSegmenter not returning the middle segment? let result: Vec = word_segmenter.segment_str(s).collect(); assert_eq!(result, vec![0, 24]); let s_utf16: Vec = s.encode_utf16().collect(); let result: Vec = dict_segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![5, 8]); // TODO(#3236): Why is WordSegmenter not returning the middle segment? let result: Vec = word_segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![0, 8]); } #[test] fn khmer_dictionary_test() { let segmenter = LineSegmenter::new_dictionary(Default::default()); let s = "ភាសាខ្មែរភាសាខ្មែរភាសាខ្មែរ"; let result: Vec = segmenter.segment_str(s).collect(); assert_eq!(result, vec![0, 27, 54, 81]); let s_utf16: Vec = s.encode_utf16().collect(); let result: Vec = segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(result, vec![0, 9, 18, 27]); } #[test] fn lao_dictionary_test() { let segmenter = LineSegmenter::new_dictionary(Default::default()); let s = "ພາສາລາວພາສາລາວພາສາລາວ"; let r: Vec = segmenter.segment_str(s).collect(); assert_eq!(r, vec![0, 12, 21, 33, 42, 54, 63]); let s_utf16: Vec = s.encode_utf16().collect(); let r: Vec = segmenter.segment_utf16(&s_utf16).collect(); assert_eq!(r, vec![0, 4, 7, 11, 14, 18, 21]); } }