//! Lex TOML tokens
//!
//! To get started, see [`Source::lex`][crate::Source::lex]

#[cfg(test)]
#[cfg(feature = "std")]
mod test;

mod token;

#[cfg(feature = "alloc")]
use alloc::vec::Vec;

use winnow::stream::AsBStr as _;
use winnow::stream::ContainsToken as _;
use winnow::stream::FindSlice as _;
use winnow::stream::Location;
use winnow::stream::Stream as _;

use crate::Span;

pub use token::Token;
pub use token::TokenKind;

/// Lex TOML [tokens][Token]
///
/// To get started, see [`Source::lex`][crate::Source::lex]
pub struct Lexer<'i> {
    stream: Stream<'i>,
    eof: bool,
}

impl<'i> Lexer<'i> {
    pub(crate) fn new(input: &'i str) -> Self {
        let mut stream = Stream::new(input);
        if input.as_bytes().starts_with(BOM) {
            let offset = BOM.len();
            #[cfg(feature = "unsafe")]
            // SAFETY: `BOM` is a complete UTF-8 sequence, so `offset` is at a UTF-8 boundary
            unsafe { stream.next_slice_unchecked(offset) };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
        Lexer { stream, eof: false }
    }

    #[cfg(feature = "alloc")]
    pub fn into_vec(self) -> Vec<Token> {
        #![allow(unused_qualifications)] // due to MSRV of 1.66
        let capacity = core::cmp::min(
            self.stream.len(),
            usize::MAX / core::mem::size_of::<Token>(),
        );
        let mut vec = Vec::with_capacity(capacity);
        vec.extend(self);
        vec
    }
}

impl Iterator for Lexer<'_> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        let Some(peek_byte) = self.stream.as_bstr().first() else {
            if self.eof {
                return None;
            } else {
                self.eof = true;
                let start = self.stream.current_token_start();
                let span = Span::new_unchecked(start, start);
                return Some(Token::new(TokenKind::Eof, span));
            }
        };
        Some(process_token(*peek_byte, &mut self.stream))
    }
}
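// Illustrative sketch (added example, not from the original test suite): the
// iterator emits a synthetic `Eof` token exactly once after the input is
// exhausted, then returns `None`. Uses only items defined in this module.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn lexer_emits_eof_once_sketch() {
    let mut lexer = Lexer::new("");
    assert!(lexer.next().is_some()); // the synthetic `Eof` token
    assert!(lexer.next().is_none()); // iteration stops after `Eof`
}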
const BOM: &[u8] = b"\xEF\xBB\xBF";

pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;

fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
    let token = match peek_byte {
        b'.' => lex_ascii_char(stream, TokenKind::Dot),
        b'=' => lex_ascii_char(stream, TokenKind::Equals),
        b',' => lex_ascii_char(stream, TokenKind::Comma),
        b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
        b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
        b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
        b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
        b' ' => lex_whitespace(stream),
        b'\t' => lex_whitespace(stream),
        b'#' => lex_comment(stream),
        b'\r' => lex_crlf(stream),
        b'\n' => lex_ascii_char(stream, TokenKind::Newline),
        b'\'' => {
            if stream.starts_with(ML_LITERAL_STRING_DELIM) {
                lex_ml_literal_string(stream)
            } else {
                lex_literal_string(stream)
            }
        }
        b'"' => {
            if stream.starts_with(ML_BASIC_STRING_DELIM) {
                lex_ml_basic_string(stream)
            } else {
                lex_basic_string(stream)
            }
        }
        _ => lex_atom(stream),
    };
    token
}

/// Process an ASCII character token
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
/// - `stream[0]` must be ASCII
fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    let offset = 1; // an ascii character
    #[cfg(feature = "unsafe")]
    // SAFETY: only called when next character is ASCII
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(kind, span)
}

/// Process Whitespace
///
/// ```bnf
/// ;; Whitespace
///
/// ws = *wschar
/// wschar = %x20 ; Space
/// wschar =/ %x09 ; Horizontal tab
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    let offset = stream
        .as_bstr()
        .offset_for(|b| !WSCHAR.contains_token(b))
        .unwrap_or(stream.eof_offset());
    #[cfg(feature = "unsafe")]
    // SAFETY: WSCHAR ensures `offset` will be at UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Whitespace, span)
}

/// ```bnf
/// wschar = %x20 ; Space
/// wschar =/ %x09 ; Horizontal tab
/// ```
pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');

/// Process Comment
///
/// ```bnf
/// ;; Comment
///
/// comment-start-symbol = %x23 ; #
/// non-ascii = %x80-D7FF / %xE000-10FFFF
/// non-eol = %x09 / %x20-7F / non-ascii
///
/// comment = comment-start-symbol *non-eol
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'#'`
fn lex_comment(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = stream
        .as_bstr()
        .find_slice((b'\r', b'\n'))
        .map(|s| s.start)
        .unwrap_or_else(|| stream.eof_offset());
    #[cfg(feature = "unsafe")]
    // SAFETY: newlines ensure `offset` is along UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Comment, span)
}

/// `comment-start-symbol = %x23 ; #`
pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
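// Illustrative sketch (added example, not from the original test suite): a
// comment token runs to the end of the line but does not consume the newline
// itself.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn comment_stops_before_newline_sketch() {
    let mut stream = Stream::new("# note\nkey");
    let _token = lex_comment(&mut stream);
    // The newline is left for the next token.
    assert_eq!(stream.as_bstr().first(), Some(&b'\n'));
}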
/// Process Newline
///
/// ```bnf
/// ;; Newline
///
/// newline = %x0A ; LF
/// newline =/ %x0D.0A ; CRLF
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'\r'`
fn lex_crlf(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let mut offset = '\r'.len_utf8();
    let has_lf = stream.as_bstr().get(1) == Some(&b'\n');
    if has_lf {
        offset += '\n'.len_utf8();
    }
    #[cfg(feature = "unsafe")]
    // SAFETY: newlines ensure `offset` is along UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Newline, span)
}

/// Process literal string
///
/// ```bnf
/// ;; Literal String
///
/// literal-string = apostrophe *literal-char apostrophe
///
/// apostrophe = %x27 ; ' apostrophe
///
/// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'\''`
fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = 1; // APOSTROPHE
    #[cfg(feature = "unsafe")]
    // SAFETY: only called when next character is ASCII
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let offset = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
        Some(span) => {
            if stream.as_bstr()[span.start] == APOSTROPHE {
                span.end
            } else {
                span.start
            }
        }
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    // SAFETY: `APOSTROPHE`/newline ensure `offset` is along UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::LiteralString, span)
}

/// `apostrophe = %x27 ; ' apostrophe`
pub(crate) const APOSTROPHE: u8 = b'\'';
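// Illustrative sketch (added example, not from the original test suite): a
// literal string token includes both the opening and the closing apostrophe.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn literal_string_consumes_closing_quote_sketch() {
    let mut stream = Stream::new("'abc' rest");
    let _token = lex_literal_string(&mut stream);
    // Lexing resumes at the byte after the closing apostrophe.
    assert_eq!(stream.as_bstr().first(), Some(&b' '));
}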
/// Process multi-line literal string
///
/// ```bnf
/// ;; Multiline Literal String
///
/// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
///                     ml-literal-string-delim
/// ml-literal-string-delim = 3apostrophe
/// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
///
/// mll-content = mll-char / newline
/// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
/// mll-quotes = 1*2apostrophe
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream.starts_with(ML_LITERAL_STRING_DELIM)`
fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = ML_LITERAL_STRING_DELIM.len();
    #[cfg(feature = "unsafe")]
    // SAFETY: only called when next character is ASCII
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
        Some(span) => span.end,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    // SAFETY: `ML_LITERAL_STRING_DELIM` ensures `offset` is along UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
        let offset = 1;
        #[cfg(feature = "unsafe")]
        // SAFETY: `APOSTROPHE` ensures `offset` is along UTF-8 boundary
        unsafe { stream.next_slice_unchecked(offset) };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);

        if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
            let offset = 1;
            #[cfg(feature = "unsafe")]
            // SAFETY: `APOSTROPHE` ensures `offset` is along UTF-8 boundary
            unsafe { stream.next_slice_unchecked(offset) };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlLiteralString, span)
}

/// `ml-literal-string-delim = 3apostrophe`
pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";

/// Process basic string
///
/// ```bnf
/// ;; Basic String
///
/// basic-string = quotation-mark *basic-char quotation-mark
///
/// quotation-mark = %x22 ; "
///
/// basic-char = basic-unescaped / escaped
/// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// escaped = escape escape-seq-char
///
/// escape = %x5C ; \
/// escape-seq-char = %x22 ; " quotation mark U+0022
/// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
/// escape-seq-char =/ %x62 ; b backspace U+0008
/// escape-seq-char =/ %x66 ; f form feed U+000C
/// escape-seq-char =/ %x6E ; n line feed U+000A
/// escape-seq-char =/ %x72 ; r carriage return U+000D
/// escape-seq-char =/ %x74 ; t tab U+0009
/// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
/// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream[0] == b'"'`
fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = 1; // QUOTATION_MARK
    #[cfg(feature = "unsafe")]
    // SAFETY: only called when next character is ASCII
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // newline is present for error recovery
        match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
                    unsafe { stream.next_slice_unchecked(offset) };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `ESCAPE` / newline ensure `offset` is along UTF-8 boundary
                    unsafe { stream.next_slice_unchecked(offset) };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1; // ESCAPE / QUOTATION_MARK
                            #[cfg(feature = "unsafe")]
                            // SAFETY: `ESCAPE`/`QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
                            unsafe { stream.next_slice_unchecked(offset) };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else if found == b'\n' {
                    let offset = span.start;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: newline ensures `offset` is along UTF-8 boundary
                    unsafe { stream.next_slice_unchecked(offset) };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                stream.finish();
                break;
            }
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::BasicString, span)
}

/// `quotation-mark = %x22 ; "`
pub(crate) const QUOTATION_MARK: u8 = b'"';

/// `escape = %x5C ; \`
pub(crate) const ESCAPE: u8 = b'\\';
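// Illustrative sketch (added example, not from the original test suite): an
// escaped quotation mark inside a basic string does not terminate the token;
// lexing stops at the unescaped closing quote.
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn escaped_quote_does_not_terminate_sketch() {
    let mut stream = Stream::new(r#""a\"b" rest"#);
    let _token = lex_basic_string(&mut stream);
    // Lexing resumes at the byte after the closing quotation mark.
    assert_eq!(stream.as_bstr().first(), Some(&b' '));
}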
/// Process multi-line basic string
///
/// ```bnf
/// ;; Multiline Basic String
///
/// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
///                   ml-basic-string-delim
/// ml-basic-string-delim = 3quotation-mark
/// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
///
/// mlb-content = mlb-char / newline / mlb-escaped-nl
/// mlb-char = mlb-unescaped / escaped
/// mlb-quotes = 1*2quotation-mark
/// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
/// mlb-escaped-nl = escape ws newline *( wschar / newline )
/// ```
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream.starts_with(ML_BASIC_STRING_DELIM)`
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    let offset = ML_BASIC_STRING_DELIM.len();
    #[cfg(feature = "unsafe")]
    // SAFETY: only called when next character is ASCII
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // unlike `lex_basic_string`, newlines are allowed here, so only the
        // closing delimiter and escapes need to be found
        match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
                    unsafe { stream.next_slice_unchecked(offset) };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    // SAFETY: `ESCAPE` ensures `offset` is along UTF-8 boundary
                    unsafe { stream.next_slice_unchecked(offset) };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1; // ESCAPE / QUOTATION_MARK
                            #[cfg(feature = "unsafe")]
                            // SAFETY: `QUOTATION_MARK`/`ESCAPE` ensure `offset` is along UTF-8 boundary
                            unsafe { stream.next_slice_unchecked(offset) };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                stream.finish();
                break;
            }
        }
    }

    if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
        let offset = 1;
        #[cfg(feature = "unsafe")]
        // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
        unsafe { stream.next_slice_unchecked(offset) };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);

        if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
            let offset = 1;
            #[cfg(feature = "unsafe")]
            // SAFETY: `QUOTATION_MARK` ensures `offset` is along UTF-8 boundary
            unsafe { stream.next_slice_unchecked(offset) };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlBasicString, span)
}

/// `ml-basic-string-delim = 3quotation-mark`
pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";

/// Process Atom
///
/// This is everything else
///
/// # Safety
///
/// - `stream` must be UTF-8
/// - `stream` must be non-empty
fn lex_atom(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Intentionally leaves off quotes in case the opening quote was missing
    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n";
    let offset = stream
        .as_bstr()
        .offset_for(|b| TOKEN_START.contains_token(b))
        .unwrap_or_else(|| stream.eof_offset());
    #[cfg(feature = "unsafe")]
    // SAFETY: `TOKEN_START` ensures `offset` is along UTF-8 boundary
    unsafe { stream.next_slice_unchecked(offset) };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Atom, span)
}
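// Illustrative sketch (added example, not from the original test suite): an
// atom runs until the next structural byte (dot, equals, comma, brackets,
// whitespace, comment start, or newline).
#[cfg(test)]
#[cfg(feature = "std")]
#[test]
fn atom_stops_at_structural_byte_sketch() {
    let mut stream = Stream::new("abc=1");
    let _token = lex_atom(&mut stream);
    // Lexing resumes at the `=` that ended the atom.
    assert_eq!(stream.as_bstr().first(), Some(&b'='));
}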