pub fn slide_hash(state: &mut crate::deflate::State) { let wsize = state.w_size as u16; // The state.head and state.prev slices have a length that is a power of 2 between 8 and 16. // That knowledge means `chunks_exact` with a (small) power of 2 can be used without risk of // missing elements. slide_hash_chain(state.head.as_mut_slice(), wsize); slide_hash_chain(state.prev.as_mut_slice(), wsize); } fn slide_hash_chain(table: &mut [u16], wsize: u16) { #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_avx2_and_bmi2() { // SAFETY: the avx2 and bmi2 target feature are enabled. return unsafe { avx2::slide_hash_chain(table, wsize) }; } #[cfg(target_arch = "aarch64")] if crate::cpu_features::is_enabled_neon() { return unsafe { neon::slide_hash_chain(table, wsize) }; } #[cfg(target_arch = "wasm32")] if crate::cpu_features::is_enabled_simd128() { // SAFETY: the simd128 target feature is enabled. return unsafe { wasm::slide_hash_chain(table, wsize) }; } rust::slide_hash_chain(table, wsize); } #[inline(always)] fn generic_slide_hash_chain(table: &mut [u16], wsize: u16) { debug_assert_eq!(table.len() % N, 0); for chunk in table.chunks_exact_mut(N) { for m in chunk.iter_mut() { *m = m.saturating_sub(wsize); } } } mod rust { pub fn slide_hash_chain(table: &mut [u16], wsize: u16) { // 32 means that 4 128-bit values can be processed per iteration. That appear to be the // optimal amount on x86_64 (SSE) and aarch64 (NEON). super::generic_slide_hash_chain::<32>(table, wsize); } } #[cfg(target_arch = "x86_64")] mod avx2 { /// # Safety /// /// Behavior is undefined if the `avx2` target feature is not enabled #[target_feature(enable = "avx2")] #[target_feature(enable = "bmi2")] #[target_feature(enable = "bmi1")] pub unsafe fn slide_hash_chain(table: &mut [u16], wsize: u16) { // 64 means that 4 256-bit values can be processed per iteration. // That appear to be the optimal amount for avx2. // // This vectorizes well https://godbolt.org/z/sGbdYba7K super::generic_slide_hash_chain::<64>(table, wsize); } } #[cfg(target_arch = "aarch64")] mod neon { /// # Safety /// /// Behavior is undefined if the `neon` target feature is not enabled #[target_feature(enable = "neon")] pub unsafe fn slide_hash_chain(table: &mut [u16], wsize: u16) { // 32 means that 4 128-bit values can be processed per iteration. That appear to be the // optimal amount for neon. super::generic_slide_hash_chain::<32>(table, wsize); } } #[cfg(target_arch = "wasm32")] mod wasm { /// # Safety /// /// Behavior is undefined if the `simd128` target feature is not enabled #[target_feature(enable = "simd128")] pub unsafe fn slide_hash_chain(table: &mut [u16], wsize: u16) { // 32 means that 4 128-bit values can be processed per iteration. That appear to be the // optimal amount on x86_64 (SSE) and aarch64 (NEON), which is what this will ultimately // compile down to. super::generic_slide_hash_chain::<32>(table, wsize); } } #[cfg(test)] mod tests { use super::*; const WSIZE: u16 = 32768; const INPUT: [u16; 64] = [ 0, 0, 28790, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43884, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64412, 0, 0, 0, 0, 0, 21043, 0, 0, 0, 0, 0, 23707, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64026, 0, 0, 20182, ]; const OUTPUT: [u16; 64] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31644, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31258, 0, 0, 0, ]; #[test] fn test_slide_hash_rust() { let mut input = INPUT; rust::slide_hash_chain(&mut input, WSIZE); assert_eq!(input, OUTPUT); } #[test] #[cfg(target_arch = "x86_64")] fn test_slide_hash_avx2() { if crate::cpu_features::is_enabled_avx2_and_bmi2() { let mut input = INPUT; unsafe { avx2::slide_hash_chain(&mut input, WSIZE) }; assert_eq!(input, OUTPUT); } } #[test] #[cfg(target_arch = "aarch64")] fn test_slide_hash_neon() { if crate::cpu_features::is_enabled_neon() { let mut input = INPUT; unsafe { neon::slide_hash_chain(&mut input, WSIZE) }; assert_eq!(input, OUTPUT); } } #[test] #[cfg(target_arch = "wasm32")] fn test_slide_hash_wasm() { if crate::cpu_features::is_enabled_simd128() { let mut input = INPUT; unsafe { wasm::slide_hash_chain(&mut input, WSIZE) }; assert_eq!(input, OUTPUT); } } quickcheck::quickcheck! { fn slide_is_rust_slide(v: Vec, wsize: u16) -> bool { // pad to a multiple of 64 (the biggest chunk size currently in use) let difference = v.len().next_multiple_of(64) - v.len(); let mut v = v; v.extend(core::iter::repeat(u16::MAX).take(difference)); let mut a = v.clone(); let mut b = v; rust::slide_hash_chain(&mut a, wsize); slide_hash_chain(&mut b, wsize); a == b } } }