use std::io::{Read, Seek, SeekFrom}; use memchr::memmem::{Finder, FinderRev}; use crate::result::ZipResult; pub trait FinderDirection<'a> { fn new(needle: &'a [u8]) -> Self; fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64; fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize); fn needle(&self) -> &[u8]; fn find(&self, haystack: &[u8]) -> Option; fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option; fn move_scope(&self, offset: usize) -> usize; } pub struct Forward<'a>(Finder<'a>); impl<'a> FinderDirection<'a> for Forward<'a> { fn new(needle: &'a [u8]) -> Self { Self(Finder::new(needle)) } fn reset_cursor((start_inclusive, _): (u64, u64), _: usize) -> u64 { start_inclusive } fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) { (&window[mid_window_offset..], mid_window_offset) } fn find(&self, haystack: &[u8]) -> Option { self.0.find(haystack) } fn needle(&self) -> &[u8] { self.0.needle() } fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option { let magic_overlap = self.needle().len().saturating_sub(1) as u64; let next = cursor.saturating_add(window_size as u64 - magic_overlap); if next >= bounds.1 { None } else { Some(next) } } fn move_scope(&self, offset: usize) -> usize { offset + self.needle().len() } } pub struct Backwards<'a>(FinderRev<'a>); impl<'a> FinderDirection<'a> for Backwards<'a> { fn new(needle: &'a [u8]) -> Self { Self(FinderRev::new(needle)) } fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64 { bounds .1 .saturating_sub(window_size as u64) .clamp(bounds.0, bounds.1) } fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) { (&window[..mid_window_offset], 0) } fn find(&self, haystack: &[u8]) -> Option { self.0.rfind(haystack) } fn needle(&self) -> &[u8] { self.0.needle() } fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option { let magic_overlap = self.needle().len().saturating_sub(1) as u64; if cursor <= bounds.0 { None } else { Some( cursor .saturating_add(magic_overlap) .saturating_sub(window_size as u64) .clamp(bounds.0, bounds.1), ) } } fn move_scope(&self, offset: usize) -> usize { offset } } /// A utility for finding magic symbols from the end of a seekable reader. /// /// Can be repurposed to recycle the internal buffer. pub struct MagicFinder { buffer: Box<[u8]>, pub(self) finder: Direction, cursor: u64, mid_buffer_offset: Option, bounds: (u64, u64), } impl<'a, T: FinderDirection<'a>> MagicFinder { /// Create a new magic bytes finder to look within specific bounds. pub fn new(magic_bytes: &'a [u8], start_inclusive: u64, end_exclusive: u64) -> Self { const BUFFER_SIZE: usize = 2048; // Smaller buffer size would be unable to locate bytes. // Equal buffer size would stall (the window could not be moved). debug_assert!(BUFFER_SIZE >= magic_bytes.len()); Self { buffer: vec![0; BUFFER_SIZE].into_boxed_slice(), finder: T::new(magic_bytes), cursor: T::reset_cursor((start_inclusive, end_exclusive), BUFFER_SIZE), mid_buffer_offset: None, bounds: (start_inclusive, end_exclusive), } } /// Repurpose the finder for different bytes or bounds. pub fn repurpose(&mut self, magic_bytes: &'a [u8], bounds: (u64, u64)) -> &mut Self { debug_assert!(self.buffer.len() >= magic_bytes.len()); self.finder = T::new(magic_bytes); self.cursor = T::reset_cursor(bounds, self.buffer.len()); self.bounds = bounds; // Reset the mid-buffer offset, to invalidate buffer content. self.mid_buffer_offset = None; self } /// Find the next magic bytes in the direction specified in the type. pub fn next(&mut self, reader: &mut R) -> ZipResult> { loop { if self.cursor < self.bounds.0 || self.cursor >= self.bounds.1 { // The finder is consumed break; } /* Position the window and ensure correct length */ let window_start = self.cursor; let window_end = self .cursor .saturating_add(self.buffer.len() as u64) .min(self.bounds.1); if window_end <= window_start { // Short-circuit on zero-sized windows to prevent loop break; } let window = &mut self.buffer[..(window_end - window_start) as usize]; if self.mid_buffer_offset.is_none() { reader.seek(SeekFrom::Start(window_start))?; reader.read_exact(window)?; } let (window, window_start_offset) = match self.mid_buffer_offset { Some(mid_buffer_offset) => T::scope_window(window, mid_buffer_offset), None => (&*window, 0usize), }; if let Some(offset) = self.finder.find(window) { let magic_pos = window_start + window_start_offset as u64 + offset as u64; reader.seek(SeekFrom::Start(magic_pos))?; self.mid_buffer_offset = Some(self.finder.move_scope(window_start_offset + offset)); return Ok(Some(magic_pos)); } self.mid_buffer_offset = None; match self .finder .move_cursor(self.cursor, self.bounds, self.buffer.len()) { Some(new_cursor) => { self.cursor = new_cursor; } None => { // Destroy the finder when we've reached the end of the bounds. self.bounds.0 = self.bounds.1; break; } } } Ok(None) } } /// A magic bytes finder with an optimistic guess that is tried before /// the inner finder begins searching from end. This enables much faster /// lookup in files without appended junk, because the magic bytes will be /// found directly. /// /// The guess can be marked as mandatory to produce an error. This is useful /// if the ArchiveOffset is known and auto-detection is not desired. pub struct OptimisticMagicFinder { inner: MagicFinder, initial_guess: Option<(u64, bool)>, } /// This is a temporary restriction, to avoid heap allocation in [`Self::next_back`]. /// /// We only use magic bytes of size 4 at the moment. const STACK_BUFFER_SIZE: usize = 8; impl<'a, Direction: FinderDirection<'a>> OptimisticMagicFinder { /// Create a new empty optimistic magic bytes finder. pub fn new_empty() -> Self { Self { inner: MagicFinder::new(&[], 0, 0), initial_guess: None, } } /// Repurpose the finder for different bytes, bounds and initial guesses. pub fn repurpose( &mut self, magic_bytes: &'a [u8], bounds: (u64, u64), initial_guess: Option<(u64, bool)>, ) -> &mut Self { debug_assert!(magic_bytes.len() <= STACK_BUFFER_SIZE); self.inner.repurpose(magic_bytes, bounds); self.initial_guess = initial_guess; self } /// Equivalent to `next_back`, with an optional initial guess attempted before /// proceeding with reading from the back of the reader. pub fn next(&mut self, reader: &mut R) -> ZipResult> { if let Some((v, mandatory)) = self.initial_guess { reader.seek(SeekFrom::Start(v))?; let mut buffer = [0; STACK_BUFFER_SIZE]; let buffer = &mut buffer[..self.inner.finder.needle().len()]; // Attempt to match only if there's enough space for the needle if v.saturating_add(buffer.len() as u64) <= self.inner.bounds.1 { reader.read_exact(buffer)?; // If a match is found, yield it. if self.inner.finder.needle() == buffer { self.initial_guess.take(); reader.seek(SeekFrom::Start(v))?; return Ok(Some(v)); } } // If a match is not found, but the initial guess was mandatory, return an error. if mandatory { return Ok(None); } // If the initial guess was not mandatory, remove it, as it was not found. self.initial_guess.take(); } self.inner.next(reader) } }