diff --git a/src/lib.rs b/src/lib.rs
index 7581f2cff..a493f0902 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -236,6 +236,7 @@ pub mod _tutorial;
 /// ```
 pub mod prelude {
     pub use crate::stream::StreamIsPartial as _;
+    pub use crate::stream::A;
     pub use crate::IResult;
     pub use crate::PResult;
     pub use crate::Parser;
diff --git a/src/parser.rs b/src/parser.rs
index f002de678..966f60e3d 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -5,6 +5,7 @@ use crate::combinator::*;
 #[cfg(feature = "unstable-recover")]
 use crate::error::FromRecoverableError;
 use crate::error::{AddContext, FromExternalError, IResult, PResult, ParseError, ParserError};
+use crate::stream::AsciiChar;
 use crate::stream::{AsChar, Compare, Location, ParseSlice, Stream, StreamIsPartial};
 #[cfg(feature = "unstable-recover")]
 use crate::stream::{Recover, Recoverable};
@@ -743,6 +744,34 @@ where
     }
 }
 
+/// This is a shortcut for [`one_of`][crate::token::one_of].
+///
+/// # Example
+///
+/// ```
+/// # use winnow::prelude::*;
+/// # use winnow::{error::ErrMode, error::{ErrorKind, InputError}};
+/// fn parser<'s>(i: &mut &'s str) -> PResult<char, InputError<&'s str>> {
+///     A!('a').parse_next(i)
+/// }
+/// assert_eq!(parser.parse_peek("abc"), Ok(("bc", 'a')));
+/// assert_eq!(parser.parse_peek(" abc"), Err(ErrMode::Backtrack(InputError::new(" abc", ErrorKind::Verify))));
+/// assert_eq!(parser.parse_peek("bc"), Err(ErrMode::Backtrack(InputError::new("bc", ErrorKind::Verify))));
+/// assert_eq!(parser.parse_peek(""), Err(ErrMode::Backtrack(InputError::new("", ErrorKind::Token))));
+/// ```
+impl<I, E> Parser<I, <I as Stream>::Token, E> for AsciiChar
+where
+    I: StreamIsPartial,
+    I: Stream,
+    <I as Stream>::Token: AsChar + Clone,
+    E: ParserError<I>,
+{
+    #[inline(always)]
+    fn parse_next(&mut self, i: &mut I) -> PResult<<I as Stream>::Token, E> {
+        crate::token::one_of(*self).parse_next(i)
+    }
+}
+
 /// This is a shortcut for [`one_of`][crate::token::one_of].
 ///
 /// # Example
diff --git a/src/stream/ascii.rs b/src/stream/ascii.rs
new file mode 100644
index 000000000..f7b25eb65
--- /dev/null
+++ b/src/stream/ascii.rs
@@ -0,0 +1,181 @@
+/// One of the 128 Unicode characters from U+0000 through U+007F,
+/// often known as the [ASCII] subset.
+///
+/// Officially, this is the first [block] in Unicode, _Basic Latin_.
+/// For details, see the [*C0 Controls and Basic Latin*][chart] code chart.
+///
+/// This block was based on older 7-bit character code standards such as
+/// ANSI X3.4-1977, ISO 646-1973, and [NIST FIPS 1-2].
+///
+/// **Note:** This is a polyfill for [`ascii::Char`][std::ascii::Char].
+///
+/// # When to use this
+///
+/// The main advantage of this subset is that it's always valid UTF-8. As such,
+/// the `&[ascii::Char]` -> `&str` conversion function (as well as other related
+/// ones) are O(1): *no* runtime checks are needed.
+///
+/// If you're consuming strings, you should usually handle Unicode and thus
+/// accept `str`s, not limit yourself to `ascii::Char`s.
+///
+/// However, certain formats are intentionally designed to produce ASCII-only
+/// output in order to be 8-bit-clean. In those cases, it can be simpler and
+/// faster to generate `ascii::Char`s instead of dealing with the variable width
+/// properties of general UTF-8 encoded strings, while still allowing the result
+/// to be used freely with other Rust things that deal in general `str`s.
+///
+/// For example, a UUID library might offer a way to produce the string
+/// representation of a UUID as an `[ascii::Char; 36]` to avoid memory
+/// allocation yet still allow it to be used as UTF-8 via `as_str` without
+/// paying for validation (or needing `unsafe` code) the way it would if it
+/// were provided as a `[u8; 36]`.
+///
+/// # Layout
+///
+/// This type is guaranteed to have a size and alignment of 1 byte.
+///
+/// # Names
+///
+/// The variants on this type are [Unicode names][NamesList] of the characters
+/// in upper camel case, with a few tweaks:
+/// - For `<control>` characters, the primary alias name is used.
+/// - `LATIN` is dropped, as this block has no non-latin letters.
+/// - `LETTER` is dropped, as `CAPITAL`/`SMALL` suffices in this block.
+/// - `DIGIT`s use a single digit rather than writing out `ZERO`, `ONE`, etc.
+///
+/// [ASCII]: https://www.unicode.org/glossary/index.html#ASCII
+/// [block]: https://www.unicode.org/glossary/index.html#block
+/// [chart]: https://www.unicode.org/charts/PDF/U0000.pdf
+/// [NIST FIPS 1-2]: https://nvlpubs.nist.gov/nistpubs/Legacy/FIPS/fipspub1-2-1977.pdf
+/// [NamesList]: https://www.unicode.org/Public/15.0.0/ucd/NamesList.txt
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(transparent)] // layout must match `u8`: `AS!` transmutes `&[u8]` to `&[AsciiChar]`
+pub struct AsciiChar(u8);
+
+impl AsciiChar {
+    /// Creates an ascii character from the byte `b`,
+    /// or returns `None` if it's too large.
+    #[inline(always)]
+    pub const fn from_u8(b: u8) -> Option<Self> {
+        if b <= 127 {
+            // SAFETY: Just checked that `b` is in-range
+            Some(unsafe { Self::from_u8_unchecked(b) })
+        } else {
+            None
+        }
+    }
+
+    /// Creates an ASCII character from the byte `b`,
+    /// without checking whether it's valid.
+    ///
+    /// # Safety
+    ///
+    /// `b` must be in `0..=127`, or else this is UB.
+    #[inline(always)]
+    pub const unsafe fn from_u8_unchecked(b: u8) -> Self {
+        Self(b)
+    }
+
+    /// Gets this ASCII character as a byte.
+    #[inline(always)]
+    pub const fn to_u8(self) -> u8 {
+        self.0 as u8
+    }
+
+    /// Gets this ASCII character as a `char` Unicode Scalar Value.
+    #[inline(always)]
+    pub const fn to_char(self) -> char {
+        self.0 as char
+    }
+}
+
+impl crate::lib::std::fmt::Display for AsciiChar {
+    fn fmt(&self, f: &mut crate::lib::std::fmt::Formatter<'_>) -> crate::lib::std::fmt::Result {
+        self.to_char().fmt(f)
+    }
+}
+
+impl crate::lib::std::fmt::Debug for AsciiChar {
+    fn fmt(&self, f: &mut crate::lib::std::fmt::Formatter<'_>) -> crate::lib::std::fmt::Result {
+        self.to_char().fmt(f)
+    }
+}
+
+/// Create an [`AsciiChar`] with compile-time validation
+#[macro_export]
+#[doc(hidden)] // forced to be visible in intended location
+macro_rules! A {
+    ($byte: literal) => {{
+        #![allow(clippy::unnecessary_cast)] // not always the same type
+
+        const BYTE: char = $byte as char;
+        const MAX: char = 127 as char;
+        const C: $crate::stream::AsciiChar = if BYTE <= MAX {
+            unsafe { $crate::stream::AsciiChar::from_u8_unchecked(BYTE as u8) }
+        } else {
+            panic!()
+        };
+        C
+    }};
+}
+
+/// Create an [`&[AsciiChar]`] with compile-time validation
+#[macro_export]
+#[doc(hidden)] // forced to be visible in intended location
+macro_rules! AS {
+    ($s: literal) => {{
+        #![allow(clippy::unnecessary_cast)] // not always the same type
+
+        const S: &'static str = $s;
+        const BYTES: &'static [u8] = S.as_bytes();
+        let mut i = 0;
+        while i < BYTES.len() {
+            let byte = BYTES[i]; // check every byte, not just the first
+            if byte <= 127 {
+            } else {
+                panic!()
+            };
+            i += 1;
+        }
+        const AS: &'static [$crate::stream::AsciiChar] = unsafe { core::mem::transmute(BYTES) };
+        AS
+    }};
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn const_number() {
+        const fn gen() -> AsciiChar {
+            crate::stream::A!(97)
+        }
+        assert_eq!(gen(), AsciiChar::from_u8(b'a').unwrap());
+    }
+
+    #[test]
+    fn const_u8() {
+        const fn gen() -> AsciiChar {
+            crate::stream::A!(b'a')
+        }
+        assert_eq!(gen(), AsciiChar::from_u8(b'a').unwrap());
+    }
+
+    #[test]
+    fn const_char() {
+        const fn gen() -> AsciiChar {
+            crate::stream::A!('a')
+        }
+        assert_eq!(gen(), AsciiChar::from_u8(b'a').unwrap());
+    }
+
+    #[test]
+    fn const_str() {
+        const fn gen() -> &'static [AsciiChar] {
+            crate::stream::AS!("a")
+        }
+        const S: &'static [AsciiChar] = gen();
+        dbg!(S);
+    }
+}
diff --git a/src/stream/mod.rs b/src/stream/mod.rs
index dc0a2401a..4fc8848b8 100644
--- a/src/stream/mod.rs
+++ b/src/stream/mod.rs
@@ -39,6 +39,7 @@ use crate::lib::std::string::String;
 #[cfg(feature = "alloc")]
 use crate::lib::std::vec::Vec;
 
+mod ascii;
 mod impls;
 #[cfg(test)]
 mod tests;
@@ -46,6 +47,10 @@ mod tests;
 /// UTF-8 Stream
 pub type Str<'i> = &'i str;
 
+pub use crate::A;
+pub use crate::AS;
+pub use ascii::AsciiChar;
+
 /// Improved `Debug` experience for `&[u8]` byte streams
 #[allow(clippy::derive_hash_xor_eq)]
 #[derive(Hash)]
@@ -461,6 +466,13 @@ impl SliceLen for u8 {
     }
 }
 
+impl SliceLen for AsciiChar {
+    #[inline(always)]
+    fn slice_len(&self) -> usize {
+        1
+    }
+}
+
 impl SliceLen for char {
     #[inline(always)]
     fn slice_len(&self) -> usize {
@@ -2166,6 +2178,20 @@ impl<'a> Compare<AsciiCaseless<u8>> for &'a [u8] {
     }
 }
 
+impl<'a> Compare<AsciiChar> for &'a [u8] {
+    #[inline(always)]
+    fn compare(&self, t: AsciiChar) -> CompareResult {
+        self.compare(t.to_u8())
+    }
+}
+
+impl<'a> Compare<AsciiCaseless<AsciiChar>> for &'a [u8] {
+    #[inline(always)]
+    fn compare(&self, t: AsciiCaseless<AsciiChar>) -> CompareResult {
+        self.compare(AsciiCaseless(t.0.to_u8()))
+    }
+}
+
 impl<'a> Compare<char> for &'a [u8] {
     #[inline(always)]
     fn compare(&self, t: char) -> CompareResult {
@@ -2208,6 +2234,20 @@ impl<'a> Compare<AsciiCaseless<u8>> for &'a str {
     }
 }
 
+impl<'a> Compare<AsciiChar> for &'a str {
+    #[inline(always)]
+    fn compare(&self, t: AsciiChar) -> CompareResult {
+        self.as_bytes().compare(t)
+    }
+}
+
+impl<'a> Compare<AsciiCaseless<AsciiChar>> for &'a str {
+    #[inline(always)]
+    fn compare(&self, t: AsciiCaseless<AsciiChar>) -> CompareResult {
+        self.as_bytes().compare(t)
+    }
+}
+
 impl<'a> Compare<char> for &'a str {
     #[inline(always)]
     fn compare(&self, t: char) -> CompareResult {
@@ -2362,6 +2402,34 @@ impl<'i> FindSlice<(u8, u8, u8)> for &'i [u8] {
     }
 }
 
+impl<'i> FindSlice<AsciiChar> for &'i [u8] {
+    #[inline(always)]
+    fn find_slice(&self, substr: AsciiChar) -> Option<usize> {
+        self.find_slice(substr.to_u8())
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar,)> for &'i [u8] {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar,)) -> Option<usize> {
+        self.find_slice(substr.0.to_u8())
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar, AsciiChar)> for &'i [u8] {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar, AsciiChar)) -> Option<usize> {
+        self.find_slice((substr.0.to_u8(), substr.1.to_u8()))
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar, AsciiChar, AsciiChar)> for &'i [u8] {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar, AsciiChar, AsciiChar)) -> Option<usize> {
+        self.find_slice((substr.0.to_u8(), substr.1.to_u8(), substr.2.to_u8()))
+    }
+}
+
 impl<'i, 's> FindSlice<&'s str> for &'i [u8] {
     #[inline(always)]
     fn find_slice(&self, substr: &'s str) -> Option<usize> {
@@ -2425,6 +2493,34 @@ impl<'i, 's> FindSlice<(&'s str, &'s str, &'s str)> for &'i str {
     }
 }
 
+impl<'i> FindSlice<AsciiChar> for &'i str {
+    #[inline(always)]
+    fn find_slice(&self, substr: AsciiChar) -> Option<usize> {
+        self.as_bytes().find_slice(substr)
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar,)> for &'i str {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar,)) -> Option<usize> {
+        self.as_bytes().find_slice(substr)
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar, AsciiChar)> for &'i str {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar, AsciiChar)) -> Option<usize> {
+        self.as_bytes().find_slice(substr)
+    }
+}
+
+impl<'i> FindSlice<(AsciiChar, AsciiChar, AsciiChar)> for &'i str {
+    #[inline(always)]
+    fn find_slice(&self, substr: (AsciiChar, AsciiChar, AsciiChar)) -> Option<usize> {
+        self.as_bytes().find_slice(substr)
+    }
+}
+
 impl<'i> FindSlice<char> for &'i str {
     #[inline(always)]
     fn find_slice(&self, substr: char) -> Option<usize> {
@@ -2877,6 +2973,21 @@ impl<'i, T: Clone> Accumulate<&'i [T]> for Vec<T> {
     }
 }
 
+#[cfg(feature = "alloc")]
+impl Accumulate<AsciiChar> for String {
+    #[inline(always)]
+    fn initial(capacity: Option<usize>) -> Self {
+        match capacity {
+            Some(capacity) => String::with_capacity(clamp_capacity::<AsciiChar>(capacity)),
+            None => String::new(),
+        }
+    }
+    #[inline(always)]
+    fn accumulate(&mut self, acc: AsciiChar) {
+        self.push(acc.to_char());
+    }
+}
+
 #[cfg(feature = "alloc")]
 impl Accumulate<char> for String {
     #[inline(always)]
@@ -3163,6 +3274,84 @@ impl<'a> AsChar for &'a u8 {
     }
 }
 
+impl AsChar for AsciiChar {
+    #[inline(always)]
+    fn as_char(self) -> char {
+        self.to_char()
+    }
+    #[inline(always)]
+    fn is_alpha(self) -> bool {
+        self.to_u8().is_alpha()
+    }
+    #[inline(always)]
+    fn is_alphanum(self) -> bool {
+        self.to_u8().is_alphanum()
+    }
+    #[inline(always)]
+    fn is_dec_digit(self) -> bool {
+        self.to_u8().is_dec_digit()
+    }
+    #[inline(always)]
+    fn is_hex_digit(self) -> bool {
+        self.to_u8().is_hex_digit()
+    }
+    #[inline(always)]
+    fn is_oct_digit(self) -> bool {
+        self.to_u8().is_oct_digit()
+    }
+    #[inline(always)]
+    fn len(self) -> usize {
+        self.to_u8().len()
+    }
+    #[inline(always)]
+    fn is_space(self) -> bool {
+        self.to_u8().is_space()
+    }
+    #[inline(always)]
+    fn is_newline(self) -> bool {
+        self.to_u8().is_newline()
+    }
+}
+
+impl<'a> AsChar for &'a AsciiChar {
+    #[inline(always)]
+    fn as_char(self) -> char {
+        (*self).as_char()
+    }
+    #[inline(always)]
+    fn is_alpha(self) -> bool {
+        (*self).is_alpha()
+    }
+    #[inline(always)]
+    fn is_alphanum(self) -> bool {
+        (*self).is_alphanum()
+    }
+    #[inline(always)]
+    fn is_dec_digit(self) -> bool {
+        (*self).is_dec_digit()
+    }
+    #[inline(always)]
+    fn is_hex_digit(self) -> bool {
+        (*self).is_hex_digit()
+    }
+    #[inline(always)]
+    fn is_oct_digit(self) -> bool {
+        (*self).is_oct_digit()
+    }
+    #[inline(always)]
+    fn len(self) -> usize {
+        (*self).len()
+    }
+    #[inline(always)]
+    fn is_space(self) -> bool {
+        (*self).is_space()
+    }
+    #[inline(always)]
+    fn is_newline(self) -> bool {
+        (*self).is_newline()
+    }
+}
+
 impl AsChar for char {
     #[inline(always)]
     fn as_char(self) -> char {
@@ -3299,6 +3488,13 @@ impl<'a> ContainsToken<&'a char> for u8 {
     }
 }
 
+impl<C: AsChar> ContainsToken<C> for AsciiChar {
+    #[inline(always)]
+    fn contains_token(&self, token: C) -> bool {
+        self.to_char() == token.as_char()
+    }
+}
+
 impl<C: AsChar> ContainsToken<C> for char {
     #[inline(always)]
     fn contains_token(&self, token: C) -> bool {
@@ -3374,6 +3570,14 @@ impl<C: AsChar> ContainsToken<C> for &'_ [u8] {
     }
 }
 
+impl<C: AsChar> ContainsToken<C> for &'_ [AsciiChar] {
+    #[inline]
+    fn contains_token(&self, token: C) -> bool {
+        let token = token.as_char();
+        self.iter().any(|t| t.to_char() == token)
+    }
+}
+
 impl<C: AsChar> ContainsToken<C> for &'_ [char] {
     #[inline]
     fn contains_token(&self, token: C) -> bool {
@@ -3390,6 +3594,14 @@ impl<const LEN: usize, C: AsChar> ContainsToken<C> for &'_ [u8; LEN] {
     }
 }
 
+impl<const LEN: usize, C: AsChar> ContainsToken<C> for &'_ [AsciiChar; LEN] {
+    #[inline]
+    fn contains_token(&self, token: C) -> bool {
+        let token = token.as_char();
+        self.iter().any(|t| t.to_char() == token)
+    }
+}
+
 impl<const LEN: usize, C: AsChar> ContainsToken<C> for &'_ [char; LEN] {
     #[inline]
     fn contains_token(&self, token: C) -> bool {
@@ -3406,6 +3618,14 @@ impl<const LEN: usize, C: AsChar> ContainsToken<C> for [u8; LEN] {
     }
 }
 
+impl<const LEN: usize, C: AsChar> ContainsToken<C> for [AsciiChar; LEN] {
+    #[inline]
+    fn contains_token(&self, token: C) -> bool {
+        let token = token.as_char();
+        self.iter().any(|t| t.to_char() == token)
+    }
+}
+
 impl<const LEN: usize, C: AsChar> ContainsToken<C> for [char; LEN] {
     #[inline]
     fn contains_token(&self, token: C) -> bool {
diff --git a/src/token/tests.rs b/src/token/tests.rs
index d2e6aeb82..b80cd3025 100644
--- a/src/token/tests.rs
+++ b/src/token/tests.rs
@@ -10,6 +10,7 @@ use crate::error::ErrorKind;
 use crate::error::InputError;
 use crate::error::Needed;
 use crate::stream::AsChar;
+use crate::stream::A;
 use crate::token::literal;
 use crate::unpeek;
 use crate::IResult;
@@ -190,9 +191,9 @@ fn complete_literal_fixed_size_array() {
 }
 
 #[test]
-fn complete_literal_char() {
+fn complete_literal_byte() {
     fn test(i: &[u8]) -> IResult<&[u8], &[u8]> {
-        literal('B').parse_peek(i)
+        literal(b'B').parse_peek(i)
     }
     assert_eq!(test(&[0x42, 0x00][..]), Ok((&b"\x00"[..], &b"\x42"[..])));
     assert_eq!(
@@ -205,9 +206,24 @@ fn complete_literal_char() {
 }
 
 #[test]
-fn complete_literal_byte() {
+fn complete_literal_ascii_char() {
     fn test(i: &[u8]) -> IResult<&[u8], &[u8]> {
-        literal(b'B').parse_peek(i)
+        literal(A!('B')).parse_peek(i)
+    }
+    assert_eq!(test(&[0x42, 0x00][..]), Ok((&b"\x00"[..], &b"\x42"[..])));
+    assert_eq!(
+        test(&[b'A', b'\0'][..]),
+        Err(ErrMode::Backtrack(error_position!(
+            &&b"A\0"[..],
+            ErrorKind::Tag
+        )))
+    );
+}
+
+#[test]
+fn complete_literal_char() {
+    fn test(i: &[u8]) -> IResult<&[u8], &[u8]> {
+        literal('B').parse_peek(i)
     }
     assert_eq!(test(&[0x42, 0x00][..]), Ok((&b"\x00"[..], &b"\x42"[..])));
     assert_eq!(