/// An inner implementation of a HashMap-like container with open addressing. /// Designed to be used in HashMap, HashSet, HashMultiMap. /// The load factor is 25%-50%. /// Uses RustC FxHasher as a hash function. /// A default value is used to mark empty slots, so it can't be used as a key. /// Inspired by https://source.chromium.org/chromium/chromium/src/+/main:components/url_pattern_index/closed_hash_map.h use std::marker::PhantomData; use crate::flatbuffers::containers::fb_index::FbIndex; /// A trait for hash table builder keys, i.e. String. /// The default value is used to mark empty slots. pub(crate) trait HashKey: Eq + std::hash::Hash + Default + Clone { /// Returns true if the key is empty. fn is_empty(&self) -> bool; } impl HashKey for T { fn is_empty(&self) -> bool { self == &T::default() } } /// A trait for hash table view keys that can be used in flatbuffers, i.e. &str. /// The implementation must synchronized with matching HashKey trait. pub(crate) trait FbHashKey: Eq + std::hash::Hash { /// Returns true if the key is empty. fn is_empty(&self) -> bool; } impl FbHashKey for &str { fn is_empty(&self) -> bool { str::is_empty(self) } } /// An internal function to find a slot in the hash table for the given key. /// Returns the slot index. /// 'table_size' is the table size. It must be a power of two. /// 'probe' must return true at least for one slot (supposing the table isn't full). pub fn find_slot( key: &I, table_size: usize, probe: impl Fn(usize) -> bool, ) -> usize { debug_assert!(table_size.is_power_of_two()); let table_mask = table_size - 1; let mut slot = get_hash(&key) & table_mask; let mut step = 1; loop { if probe(slot) { return slot; } slot = (slot + step) & table_mask; step += 1; } } /// A flatbuffer-compatible view of a hash table. /// It's used to access the hash table without copying the keys and values. /// Is loaded from HashIndexBuilder data, serialized into a flatbuffer. pub(crate) struct HashIndexView, Values: FbIndex> { indexes: Keys, values: Values, _phantom_i: PhantomData, _phantom_v: PhantomData, } impl, Values: FbIndex> HashIndexView { pub fn new(indexes: Keys, values: Values) -> Self { Self { indexes, values, _phantom_i: PhantomData, _phantom_v: PhantomData, } } pub fn capacity(&self) -> usize { self.indexes.len() } pub fn get_single(&self, key: I) -> Option { let slot = find_slot(&key, self.capacity(), |slot| -> bool { FbHashKey::is_empty(&self.indexes.get(slot)) || self.indexes.get(slot) == key }); if FbHashKey::is_empty(&self.indexes.get(slot)) { None } else { Some(self.values.get(slot)) } } #[cfg(test)] /// Returns the number of non-empty slots in the hash table. /// Slow, use only for tests. pub fn len(&self) -> usize { let mut len = 0; for i in 0..self.capacity() { if !FbHashKey::is_empty(&self.indexes.get(i)) { len += 1; } } len } } /// A builder for a hash table. /// The default value is used to mark empty slots. /// `consume()` output is suppose to be serialized into a flatbuffer and /// used as a HashIndexView. pub(crate) struct HashIndexBuilder { indexes: Vec, values: Vec, size: usize, } /// An internal function to hash a key. /// The hash must be persistent across different runs of the program. fn get_hash(key: &I) -> usize { // RustC Hash is 2x faster than DefaultHasher. use rustc_hash::FxHasher; use std::hash::Hasher; let mut hasher = FxHasher::default(); key.hash(&mut hasher); hasher.finish() as usize } impl Default for HashIndexBuilder { fn default() -> Self { Self::new_with_capacity(4) } } impl HashIndexBuilder { pub fn new_with_capacity(capacity: usize) -> Self { Self { size: 0, indexes: vec![I::default(); capacity], values: vec![V::default(); capacity], } } pub fn insert(&mut self, key: I, value: V, allow_duplicates: bool) -> (usize, &mut V) { debug_assert!(!HashKey::is_empty(&key), "Key is empty"); let slot = find_slot(&key, self.capacity(), |slot| -> bool { HashKey::is_empty(&self.indexes[slot]) || (self.indexes[slot] == key && !allow_duplicates) }); if HashKey::is_empty(&self.indexes[slot]) { self.indexes[slot] = key; self.values[slot] = value; self.size += 1; self.maybe_increase_capacity(); (slot, &mut self.values[slot]) } else { self.values[slot] = value; (slot, &mut self.values[slot]) } } fn capacity(&self) -> usize { self.indexes.len() } pub fn get_or_insert(&mut self, key: I, value: V) -> &mut V { let slot = find_slot(&key, self.capacity(), |slot| -> bool { HashKey::is_empty(&self.indexes[slot]) || self.indexes[slot] == key }); if !HashKey::is_empty(&self.indexes[slot]) { return &mut self.values[slot]; } let (_, new_value) = self.insert(key, value, false); new_value } fn maybe_increase_capacity(&mut self) { if self.size * 2 <= self.capacity() { // Use 50% load factor. return; } let new_capacity = (self.capacity() * 2).next_power_of_two(); let old_indexes = std::mem::take(&mut self.indexes); let old_values = std::mem::take(&mut self.values); self.indexes = vec![I::default(); new_capacity]; self.values = vec![V::default(); new_capacity]; for (key, value) in old_indexes.into_iter().zip(old_values.into_iter()) { if !HashKey::is_empty(&key) { let slot = find_slot(&key, new_capacity, |slot| -> bool { HashKey::is_empty(&self.indexes[slot]) }); self.indexes[slot] = key; self.values[slot] = value; } } } pub fn consume(value: Self) -> (Vec, Vec) { (value.indexes, value.values) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_get_hash() { // Verify get_hash is stable. // If the value changes, update ADBLOCK_RUST_DAT_VERSION. let message = "If the value changes, update ADBLOCK_RUST_DAT_VERSION."; assert_eq!(get_hash(&"adblock-rust"), 15102204115509201409, "{message}"); } }