use core::fmt; use core::mem::MaybeUninit; use core::ops::Range; use crate::cpu_features::CpuFeatures; use crate::weak_slice::WeakSliceMut; pub struct Writer<'a> { buf: WeakSliceMut<'a, MaybeUninit>, filled: usize, } impl<'a> Writer<'a> { /// Creates a new `Writer` from a fully initialized buffer. #[inline] pub fn new(buf: &'a mut [u8]) -> Writer<'a> { // SAFETY: Because buf is a slice, most of the preconditions for // core::slice::from_raw_parts_mut are satisfied: // * buf.as_mut_ptr() is non-null. // * The memory range is within a single allocated object. // * buf is mutable, so the range is valid for both reads // and writes of up to buf.len() * size_of::() bytes. // * The contents of the slice are initialized. // * buf.as_mut_ptr() + buf.len() does not wrap around // the end of the address space. // The remaining precondition is enforced by the borrow checker when this function is called: // * The memory range cannot be accessed through any other pointer for the // duration of lifetime 'a. unsafe { Self::new_uninit(buf.as_mut_ptr(), buf.len()) } } /// Creates a new `Writer` from an uninitialized buffer. /// /// # Safety /// /// The arguments must satisfy the requirements of [`core::slice::from_raw_parts_mut`]. #[inline] pub unsafe fn new_uninit(ptr: *mut u8, len: usize) -> Writer<'a> { // SAFETY: The preconditions for WeakSliceMut::from_raw_parts_mut are the same // as for core::slice::from_raw_parts_mut, and the caller is responsible for // ensuring the latter. let buf = unsafe { WeakSliceMut::from_raw_parts_mut(ptr as *mut MaybeUninit, len) }; Writer { buf, filled: 0 } } #[inline] pub unsafe fn new_uninit_raw(ptr: *mut u8, len: usize, capacity: usize) -> Writer<'a> { let buf = unsafe { WeakSliceMut::from_raw_parts_mut(ptr as *mut MaybeUninit, capacity) }; Writer { buf, filled: len } } /// Pointer to where the next byte will be written #[inline] pub fn next_out(&mut self) -> *mut MaybeUninit { self.buf.as_mut_ptr().wrapping_add(self.filled).cast() } /// Returns the total capacity of the buffer. #[inline] pub fn capacity(&self) -> usize { self.buf.len() } /// Returns the length of the filled part of the buffer #[inline] pub fn len(&self) -> usize { self.filled } /// Returns a shared reference to the filled portion of the buffer. #[inline] pub fn filled(&self) -> &[u8] { // SAFETY: the filled area of the buffer is always initialized, and self.filled is always // in-bounds. unsafe { core::slice::from_raw_parts(self.buf.as_ptr().cast(), self.filled) } } /// Returns the number of bytes at the end of the slice that have not yet been filled. #[inline] pub fn remaining(&self) -> usize { self.capacity() - self.filled } #[inline] pub fn is_full(&self) -> bool { self.filled == self.buf.len() } pub fn push(&mut self, byte: u8) { self.buf.as_mut_slice()[self.filled] = MaybeUninit::new(byte); self.filled += 1; } /// Appends data to the buffer #[inline(always)] pub fn extend(&mut self, buf: &[u8]) { // using simd here (on x86_64) was not fruitful self.buf.as_mut_slice()[self.filled..][..buf.len()].copy_from_slice(slice_to_uninit(buf)); self.filled += buf.len(); } #[inline(always)] pub fn extend_from_window(&mut self, window: &super::window::Window, range: Range) { self.extend_from_window_with_features::<{ CpuFeatures::NONE }>(window, range) } pub fn extend_from_window_with_features( &mut self, window: &super::window::Window, range: Range, ) { match FEATURES { #[cfg(target_arch = "x86_64")] CpuFeatures::AVX2 => self.extend_from_window_help::<32>(window, range), _ => self.extend_from_window_runtime_dispatch(window, range), } } fn extend_from_window_runtime_dispatch( &mut self, window: &super::window::Window, range: Range, ) { // NOTE: the dynamic check for avx512 makes avx2 slower. Measure this carefully before re-enabling // // #[cfg(target_arch = "x86_64")] // if crate::cpu_features::is_enabled_avx512() { // return self.extend_from_window_help::<64>(window, range); // } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_avx2_and_bmi2() { return self.extend_from_window_help::<32>(window, range); } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_sse() { return self.extend_from_window_help::<16>(window, range); } #[cfg(target_arch = "aarch64")] if crate::cpu_features::is_enabled_neon() { return self.extend_from_window_help::<16>(window, range); } #[cfg(target_arch = "wasm32")] if crate::cpu_features::is_enabled_simd128() { return self.extend_from_window_help::<16>(window, range); } self.extend_from_window_help::<8>(window, range) } #[inline(always)] fn extend_from_window_help( &mut self, window: &super::window::Window, range: Range, ) { let len = range.end - range.start; if self.remaining() >= len + N { // SAFETY: we know that our window has at least a N extra bytes // at the end, making it always safe to perform an (unaligned) Chunk read anywhere in // the window slice. // // The calling function checks for CPU features requirements for C. unsafe { let src = window.as_ptr(); Self::copy_chunk_unchecked::( src.wrapping_add(range.start).cast(), self.next_out(), len, ) } } else { let buf = &window.as_slice()[range]; self.buf.as_mut_slice()[self.filled..][..buf.len()] .copy_from_slice(slice_to_uninit(buf)); } self.filled += len; } /// Variant of `extend_from_window` used with `inflateBack`. It does not attempt a chunked /// copy, because there is no padding at the end and the window and output buffer alias. /// So a standard `memmove` will have to do. #[inline(always)] pub fn extend_from_window_back(&mut self, window: &super::window::Window, range: Range) { let len = range.end - range.start; unsafe { core::ptr::copy( window.as_ptr().add(range.start), self.buf.as_mut_ptr().add(self.filled).cast(), len, ); } self.filled += len; } #[inline(always)] pub fn copy_match(&mut self, offset_from_end: usize, length: usize) { self.copy_match_with_features::<{ CpuFeatures::NONE }>(offset_from_end, length) } #[inline(always)] pub fn copy_match_with_features( &mut self, offset_from_end: usize, length: usize, ) { match FEATURES { #[cfg(target_arch = "x86_64")] CpuFeatures::AVX2 => self.copy_match_help::<32>(offset_from_end, length), _ => self.copy_match_runtime_dispatch(offset_from_end, length), } } fn copy_match_runtime_dispatch(&mut self, offset_from_end: usize, length: usize) { // NOTE: the dynamic check for avx512 makes avx2 slower. Measure this carefully before re-enabling // // #[cfg(target_arch = "x86_64")] // if crate::cpu_features::is_enabled_avx512() { // return self.copy_match_help::(offset_from_end, length); // } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_avx2_and_bmi2() { return self.copy_match_help::<32>(offset_from_end, length); } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_sse() { return self.copy_match_help::<16>(offset_from_end, length); } #[cfg(target_arch = "aarch64")] if crate::cpu_features::is_enabled_neon() { return self.copy_match_help::<16>(offset_from_end, length); } #[cfg(target_arch = "wasm32")] if crate::cpu_features::is_enabled_simd128() { return self.copy_match_help::<16>(offset_from_end, length); } self.copy_match_help::<8>(offset_from_end, length) } #[inline(always)] fn copy_match_help(&mut self, offset_from_end: usize, length: usize) { let capacity = self.buf.len(); let len = Ord::min(self.filled + length + N, capacity); let buf = &mut self.buf.as_mut_slice()[..len]; let current = self.filled; self.filled += length; // Note also that the referenced string may overlap the current // position; for example, if the last 2 bytes decoded have values // X and Y, a string reference with // adds X,Y,X,Y,X to the output stream. if length > offset_from_end { match offset_from_end { 1 => { // this will just repeat this value many times let element = buf[current - 1]; buf[current..][..length].fill(element); } _ => { // there is a SIMD implementation of this logic, which _should_ be faster, but // isn't in measurements on x86_64. It still might be for other architectures, // adds a lot of complexity and unsafe code. for i in 0..length { buf[current + i] = buf[current - offset_from_end + i]; } } } } else { Self::copy_chunked_within::(buf, capacity, current, offset_from_end, length); } } /// Variant of `copy_match` used with `inflateBack`. It does not attempt a chunked /// copy, because there is no padding at the end. #[inline(always)] pub fn copy_match_back(&mut self, offset_from_end: usize, length: usize) { let capacity = self.buf.len(); let len = Ord::min(self.filled + length, capacity); let buf = &mut self.buf.as_mut_slice()[..len]; let current = self.filled; self.filled += length; // Note also that the referenced string may overlap the current // position; for example, if the last 2 bytes decoded have values // X and Y, a string reference with // adds X,Y,X,Y,X to the output stream. match offset_from_end { 1 => { // this will just repeat this value many times let element = buf[current - 1]; buf[current..][..length].fill(element); } _ => { for i in 0..length { buf[current + i] = buf[current - offset_from_end + i]; } } } } #[inline(always)] fn copy_chunked_within( buf: &mut [MaybeUninit], capacity: usize, current: usize, offset_from_end: usize, length: usize, ) { let start = current.checked_sub(offset_from_end).expect("in bounds"); if current + length + N < capacity { let ptr = buf.as_mut_ptr(); // SAFETY: if statement and checked_sub ensures we stay in bounds. unsafe { Self::copy_chunk_unchecked::(ptr.add(start), ptr.add(current), length) } } else { // a full simd copy does not fit in the output buffer buf.copy_within(start..start + length, current); } } /// # Safety /// /// `src..src + length` must be safe to perform reads in chunks of N elements until /// `src + length` is reached. `dst` must be safe to (unaligned) write that number of chunks. #[inline(always)] unsafe fn copy_chunk_unchecked( mut src: *const MaybeUninit, mut dst: *mut MaybeUninit, length: usize, ) { if length == 0 { return; } // SAFETY: The caller ensured that src + length is within (or just at the end of) // a readable range of bytes. LLVM disallows allocations bigger than isize::MAX, // so if src..src+length is a valid allocation (a precondition of this function) // the length will never exceed isize::MAX. let end = unsafe { src.add(length) }; // SAFETY: We checked above that length != 0, so there is at least one chunk remaining. let chunk = unsafe { load_chunk::(src) }; unsafe { store_chunk::(dst, chunk) }; // SAFETY: src and dest haven't been modified yet, and we checked above that // length != 0, so adding one chunk (N bytes) to both src and dst will result in // a pointer in (or just at the end of) each of the underlying buffers. src = unsafe { src.add(N) }; dst = unsafe { dst.add(N) }; while src < end { // SAFETY: The caller ensured that src and dst contain enough bytes to support // reads (from src) or writes (to dst) up to and including the chunk that contains // end. Note that, if length is not a multiple of N, we will copy up to N-1 bytes // past end. let chunk = unsafe { load_chunk::(src) }; unsafe { store_chunk::(dst, chunk) }; // SAFETY: Because src is currently < end, we have at least one more chunk available // to copy, so there is room to advance the pointers by N within both the src and // dst bufs. src = unsafe { src.add(N) }; dst = unsafe { dst.add(N) }; } } } /// # Safety /// /// Must be valid to read a `[u8; N]` value from `from` with an unaligned read. #[inline(always)] unsafe fn load_chunk(from: *const MaybeUninit) -> [MaybeUninit; N] { // SAFETY: Checked by the caller. unsafe { core::ptr::read_unaligned(from.cast::<[MaybeUninit; N]>()) } } /// # Safety /// /// Must be valid to write a `[u8; N]` value to `out` with an unaligned write. #[inline(always)] unsafe fn store_chunk(out: *mut MaybeUninit, chunk: [MaybeUninit; N]) { // SAFETY: checked by the caller. unsafe { core::ptr::write_unaligned(out.cast(), chunk) } } impl fmt::Debug for Writer<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Writer") .field("ptr", &self.buf.as_ptr()) .field("filled", &self.filled) .field("capacity", &self.capacity()) .finish() } } fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit] { unsafe { &*(slice as *const [u8] as *const [MaybeUninit]) } } #[cfg(test)] mod test { use super::*; const N: usize = 128; const M: usize = 64; fn test_array() -> [MaybeUninit; N] { core::array::from_fn(|i| MaybeUninit::new(if i < M { i as u8 } else { 0xAAu8 })) } fn test_copy_match(offset_from_end: usize, length: usize) { let mut buf = test_array(); let mut writer = Writer { buf: unsafe { WeakSliceMut::from_raw_parts_mut(buf.as_mut_ptr(), buf.len()) }, filled: M, }; writer.copy_match(offset_from_end, length); assert_eq!(writer.filled, M + length); let mut naive = test_array(); for i in 0..length { naive[M + i] = naive[M - offset_from_end + i]; } let buf = unsafe { core::mem::transmute::<[MaybeUninit; 128], [u8; N]>(buf) }; let naive = unsafe { core::mem::transmute::<[MaybeUninit; 128], [u8; N]>(naive) }; assert_eq!( buf[M..][..length], naive[M..][..length], "{offset_from_end} {length}" ); } #[test] fn copy_chunk_unchecked() { let offset_from_end = 17; let length = 17; macro_rules! helper { ($func:expr) => { let mut buf = test_array(); let mut writer = Writer { buf: unsafe { WeakSliceMut::from_raw_parts_mut(buf.as_mut_ptr(), buf.len()) }, filled: M, }; $func(&mut writer, offset_from_end, length); }; } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_avx512() { helper!(Writer::copy_match_help::<64>); } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_avx2_and_bmi2() { helper!(Writer::copy_match_help::<32>); } #[cfg(target_arch = "x86_64")] if crate::cpu_features::is_enabled_sse() { helper!(Writer::copy_match_help::<16>); } #[cfg(target_arch = "aarch64")] if crate::cpu_features::is_enabled_neon() { helper!(Writer::copy_match_help::<16>); } #[cfg(target_arch = "wasm32")] if crate::cpu_features::is_enabled_simd128() { helper!(Writer::copy_match_help::<16>); } helper!(Writer::copy_match_help::<8>); } #[test] fn copy_match() { for offset_from_end in 1..=64 { for length in 0..=64 { test_copy_match(offset_from_end, length) } } } #[test] fn copy_match_insufficient_space_for_simd() { let mut buf = [1, 2, 3, 0xAA, 0xAA].map(MaybeUninit::new); let mut writer = Writer { buf: unsafe { WeakSliceMut::from_raw_parts_mut(buf.as_mut_ptr(), buf.len()) }, filled: 3, }; writer.copy_match(3, 2); assert_eq!(buf.map(|e| unsafe { e.assume_init() }), [1, 2, 3, 1, 2]); } }