#[cfg(feature = "encoding")] use encoding_rs::UTF_8; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, Result, SyntaxError}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; use crate::reader::{BangType, Config, ParseState}; use crate::utils::{is_whitespace, name_len}; /// A struct that holds a current reader state and a parser configuration. /// It is independent on a way of reading data: the reader feed data into it and /// get back produced [`Event`]s. #[derive(Clone, Debug)] pub(super) struct ReaderState { /// Number of bytes read from the source of data since the reader was created pub offset: u64, /// A snapshot of an `offset` of the last error returned. It can be less than /// `offset`, because some errors conveniently report at earlier position, /// and changing `offset` is not possible, because `Error::IllFormed` errors /// are recoverable. pub last_error_offset: u64, /// Defines how to process next byte pub state: ParseState, /// User-defined settings that affect parsing pub config: Config, /// All currently Started elements which didn't have a matching /// End element yet. /// /// For an XML /// /// ```xml /// | /// ``` /// when cursor at the `|` position buffer contains: /// /// ```text /// rootinner /// ^ ^ /// ``` /// /// The `^` symbols shows which positions stored in the [`Self::opened_starts`] /// (0 and 4 in that case). opened_buffer: Vec, /// Opened name start indexes into [`Self::opened_buffer`]. See documentation /// for that field for details opened_starts: Vec, #[cfg(feature = "encoding")] /// Reference to the encoding used to read an XML pub encoding: EncodingRef, } impl ReaderState { /// Trims end whitespaces from `bytes`, if required, and returns a text event. /// /// # Parameters /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> { let mut content = bytes; if self.config.trim_text_end { // Skip the ending '<' let len = bytes .iter() .rposition(|&b| !is_whitespace(b)) .map_or(0, |p| p + 1); content = &bytes[..len]; } BytesText::wrap(content, self.decoder()) } /// Returns `Comment`, `CData` or `DocType` event. /// /// `buf` contains data between `<` and `>`: /// - CDATA: `![CDATA[...]]` /// - Comment: `!--...--` /// - Doctype (uppercase): `!D...` /// - Doctype (lowercase): `!d...` pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { debug_assert_eq!( buf.first(), Some(&b'!'), "CDATA, comment or DOCTYPE should start from '!'" ); let uncased_starts_with = |string: &[u8], prefix: &[u8]| { string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) }; let len = buf.len(); match bang_type { BangType::Comment if buf.starts_with(b"!--") => { debug_assert!(buf.ends_with(b"--")); if self.config.check_comments { // search if '--' not in comments let mut haystack = &buf[3..len - 2]; let mut off = 0; while let Some(p) = memchr::memchr(b'-', haystack) { off += p + 1; // if next byte after `-` is also `-`, return an error if buf[3 + off] == b'-' { // Explanation of the magic: // // - `self.offset`` just after `>`, // - `buf` contains `!-- con--tent --` // - `p` is counted from byte after `: // ~~~~~~~~~~~~~~~~ : - buf // : =========== : - zone of search (possible values of `p`) // : |---p : - p is counted from | (| is 0) // : : : ^ - self.offset // ^ : : - self.offset - len // ^ : - self.offset - len + 2 // ^ - self.offset - len + 2 + p self.last_error_offset = self.offset - len as u64 + 2 + p as u64; return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } // Continue search after single `-` (+1 to skip it) haystack = &haystack[p + 1..]; } } Ok(Event::Comment(BytesText::wrap( // Cut of `!--` and `--` from start and end &buf[3..len - 2], self.decoder(), ))) } // XML requires uppercase only: // https://www.w3.org/TR/xml11/#sec-cdata-sect // Even HTML5 required uppercase only: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::CData if buf.starts_with(b"![CDATA[") => { debug_assert!(buf.ends_with(b"]]")); Ok(Event::CData(BytesCData::wrap( // Cut of `![CDATA[` and `]]` from start and end &buf[8..len - 2], self.decoder(), ))) } // XML requires uppercase only, but we will check that on validation stage: // https://www.w3.org/TR/xml11/#sec-prolog-dtd // HTML5 allows mixed case for doctype declarations: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( // Cut of `!DOCTYPE` and any number of spaces from start &buf[8 + start..], self.decoder(), ))), None => { // Because we here, we at least read `` and offset after `>`. // We want report error at place where name is expected - this is just // before `>` self.last_error_offset = self.offset - 1; return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); } } } _ => { // // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. // ^------- We report error at that position, so we need to subtract 2 and buf len self.last_error_offset = self.offset - len as u64 - 2; Err(bang_type.to_err().into()) } } } /// Wraps content of `buf` into the [`Event::End`] event. Does the check that /// end name matches the last opened start name if `self.config.check_end_names` is set. /// /// `buf` contains data between `<` and `>`, for example `/tag`. pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result> { debug_assert_eq!( buf.first(), Some(&b'/'), "closing tag should start from '/'" ); // Strip the `/` character. `content` contains data between `` let content = &buf[1..]; // XML standard permits whitespaces after the markup name in closing tags. // Let's strip them from the buffer before comparing tag names. let name = if self.config.trim_markup_names_in_closing_tags { if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { &content[..pos_end_name + 1] } else { content } } else { content }; let decoder = self.decoder(); // Get the index in self.opened_buffer of the name of the last opened tag match self.opened_starts.pop() { Some(start) => { if self.config.check_end_names { let expected = &self.opened_buffer[start..]; if name != expected { let expected = decoder.decode(expected).unwrap_or_default().into_owned(); // #513: In order to allow error recovery we should drop content of the buffer self.opened_buffer.truncate(start); // Report error at start of the end tag at `<` character // -2 for `<` and `>` self.last_error_offset = self.offset - buf.len() as u64 - 2; return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { expected, found: decoder.decode(name).unwrap_or_default().into_owned(), })); } } self.opened_buffer.truncate(start); } None => { if !self.config.allow_unmatched_ends { // Report error at start of the end tag at `<` character // -2 for `<` and `>` self.last_error_offset = self.offset - buf.len() as u64 - 2; return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( decoder.decode(name).unwrap_or_default().into_owned(), ))); } } } Ok(Event::End(BytesEnd::wrap(name.into()))) } /// `buf` contains data between `<` and `>` and the first byte is `?`. /// `self.offset` already after the `>` /// /// Returns `Decl` or `PI` event pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result> { debug_assert!(buf.len() > 0); debug_assert_eq!(buf[0], b'?'); let len = buf.len(); // We accept at least // ~~ - len = 2 if len > 1 && buf[len - 1] == b'?' { // Cut of `?` and `?` from start and end let content = &buf[1..len - 1]; let len = content.len(); if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); // Try getting encoding from the declaration event #[cfg(feature = "encoding")] if self.encoding.can_be_refined() { if let Some(encoding) = event.encoder() { self.encoding = EncodingRef::XmlDetected(encoding); } } Ok(Event::Decl(event)) } else { Ok(Event::PI(BytesPI::wrap(content, name_len(content)))) } } else { // `) self.last_error_offset = self.offset - len as u64 - 2; Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) } } /// Converts content of a tag to a `Start` or an `Empty` event /// /// # Parameters /// - `content`: Content of a tag between `<` and `>` pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> { if let Some(content) = content.strip_suffix(b"/") { // This is self-closed tag `` let event = BytesStart::wrap(content, name_len(content)); if self.config.expand_empty_elements { self.state = ParseState::InsideEmpty; self.opened_starts.push(self.opened_buffer.len()); self.opened_buffer.extend(event.name().as_ref()); Event::Start(event) } else { Event::Empty(event) } } else { let event = BytesStart::wrap(content, name_len(content)); // #514: Always store names event when .check_end_names == false, // because checks can be temporary disabled and when they would be // enabled, we should have that information self.opened_starts.push(self.opened_buffer.len()); self.opened_buffer.extend(event.name().as_ref()); Event::Start(event) } } #[inline] pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> { self.state = ParseState::InsideText; let name = self .opened_buffer .split_off(self.opened_starts.pop().unwrap()); BytesEnd::wrap(name.into()) } /// Get the decoder, used to decode bytes, read by this reader, to the strings. /// /// If [`encoding`] feature is enabled, the used encoding may change after /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. /// /// If [`encoding`] feature is enabled and no encoding is specified in declaration, /// defaults to UTF-8. /// /// [`encoding`]: ../../index.html#encoding pub const fn decoder(&self) -> Decoder { Decoder { #[cfg(feature = "encoding")] encoding: self.encoding.encoding(), } } } impl Default for ReaderState { fn default() -> Self { Self { offset: 0, last_error_offset: 0, state: ParseState::Init, config: Config::default(), opened_buffer: Vec::new(), opened_starts: Vec::new(), #[cfg(feature = "encoding")] encoding: EncodingRef::Implicit(UTF_8), } } }