diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9fb4ee79..a8aaa20f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.56.0 + - uses: dtolnay/rust-toolchain@1.79.0 - run: cargo check minimal-versions: diff --git a/Cargo.toml b/Cargo.toml index f7944de1..2c72f53d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ repository = "https://github.com/tafia/quick-xml" keywords = ["xml", "serde", "parser", "writer", "html"] categories = ["asynchronous", "encoding", "parsing", "parser-implementations"] license = "MIT" -rust-version = "1.56" +rust-version = "1.79" # We exclude tests & examples & benches to reduce the size of a package. # Unfortunately, this is source of warnings in latest cargo when packaging: # > warning: ignoring {context} `{name}` as `{path}` is not included in the published package @@ -213,11 +213,6 @@ name = "async-tokio" required-features = ["async-tokio"] path = "tests/async-tokio.rs" -[[test]] -name = "encodings" -required-features = ["encoding"] -path = "tests/encodings.rs" - [[test]] name = "html" required-features = ["escape-html"] diff --git a/Changelog.md b/Changelog.md index 56b6e81f..293c8b04 100644 --- a/Changelog.md +++ b/Changelog.md @@ -32,6 +32,13 @@ - `Attribute::decode_and_unescape_value_with()` Deprecated functions now behaves the same as newly added. +- [#947]: Add new constructors to `Reader` and `NsReader` that perform automatic streaming UTF-8 + validation on the underlying input. Validation failures are raised as errors when the `Reader` + is used. These APIs are currently considered "experimental". + - `Reader::from_reader_validating()` + - `Reader::from_file_validating()` + - `NsReader::from_reader_validating()` + - `NsReader::from_file_validating()` ### Bug Fixes @@ -46,11 +53,13 @@ accepts `XmlVersion` parameter to apply correct EOL normalization rules. 
- [#944]: `read_text()` now returns `BytesText` which allows you to get the content with properly normalized EOLs. To get the previous behavior use `.read_text().decode()?`. +- [#947]: Bumped MSRV from 1.56 (Oct 2021) to 1.79 (June 2024) [#371]: https://github.com/tafia/quick-xml/issues/371 [#914]: https://github.com/tafia/quick-xml/pull/914 [#938]: https://github.com/tafia/quick-xml/pull/938 [#944]: https://github.com/tafia/quick-xml/pull/944 +[#947]: https://github.com/tafia/quick-xml/pull/947 ## 0.39.2 -- 2026-02-20 diff --git a/README.md b/README.md index 28c69dfe..8bd01e7e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Crate](https://img.shields.io/crates/v/quick-xml.svg)](https://crates.io/crates/quick-xml) [![docs.rs](https://docs.rs/quick-xml/badge.svg)](https://docs.rs/quick-xml) [![codecov](https://img.shields.io/codecov/c/github/tafia/quick-xml)](https://codecov.io/gh/tafia/quick-xml) -[![MSRV](https://img.shields.io/badge/rustc-1.56.0+-ab6000.svg)](https://blog.rust-lang.org/2021/10/21/Rust-1.56.0.html) +[![MSRV](https://img.shields.io/badge/rustc-1.79.0+-ab6000.svg)](https://blog.rust-lang.org/2024/06/13/Rust-1.79.0/) High performance xml pull reader/writer. diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 6955e6ad..deae040d 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -1,7 +1,3 @@ -// std::hint::black_box stable since 1.66, but our MSRV = 1.56. -// criterion::black_box is deprecated in since criterion 0.7. 
-// Running benchmarks assumed on current Rust version, so this should be fine -#![allow(clippy::incompatible_msrv)] use criterion::{self, criterion_group, criterion_main, Criterion, Throughput}; use quick_xml::events::Event; use quick_xml::reader::{NsReader, Reader}; diff --git a/benches/microbenches.rs b/benches/microbenches.rs index bb2e9649..985595a2 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -1,7 +1,3 @@ -// std::hint::black_box stable since 1.66, but our MSRV = 1.56. -// criterion::black_box is deprecated in since criterion 0.7. -// Running benchmarks assumed on current Rust version, so this should be fine -#![allow(clippy::incompatible_msrv)] use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::escape::{escape, unescape}; diff --git a/src/encoding.rs b/src/encoding.rs index d894f950..40744bbf 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -1,23 +1,77 @@ //! A module for wrappers that encode / decode data. use std::borrow::Cow; +use std::io::{self, Read}; use std::str::Utf8Error; #[cfg(feature = "encoding")] -use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8}; +use encoding_rs; /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8. /// See pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF]; /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order. /// See -#[cfg(feature = "encoding")] pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE]; /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order. /// See -#[cfg(feature = "encoding")] pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF]; +/// An error type representing UTF-8 validation failure. +/// +/// Unlike [`std::str::Utf8Error`], instances can be created directly for custom error scenarios. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Utf8ValidationError { + /// Error from standard library UTF-8 validation + Utf8(Utf8Error), + /// Invalid UTF-8 sequence found in the input + InvalidSequence { + /// Length of the invalid UTF-8 sequence in bytes + error_len: usize, + }, + /// Incomplete UTF-8 sequence at end of stream + IncompleteSequence, + /// Non-UTF-8 encoding detected at start of stream + NonUtf8EncodingDetected(DetectedEncoding), +} + +impl From for Utf8ValidationError { + #[inline] + fn from(e: Utf8Error) -> Self { + Self::Utf8(e) + } +} + +impl std::fmt::Display for Utf8ValidationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Utf8(e) => write!(f, "{}", e), + Self::InvalidSequence { error_len } => { + write!(f, "invalid UTF-8 sequence of {} bytes", error_len) + } + Self::IncompleteSequence => { + write!(f, "incomplete UTF-8 sequence at end of stream") + } + Self::NonUtf8EncodingDetected(detected) => { + write!( + f, + "non-UTF-8 encoding detected at start of stream: {:?}", + detected + ) + } + } + } +} + +impl std::error::Error for Utf8ValidationError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Utf8(e) => Some(e), + _ => None, + } + } +} + /// An error when decoding or encoding /// /// If feature [`encoding`] is disabled, the [`EncodingError`] is always [`EncodingError::Utf8`] @@ -27,15 +81,22 @@ pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF]; #[non_exhaustive] pub enum EncodingError { /// Input was not valid UTF-8 - Utf8(Utf8Error), + Utf8(Utf8ValidationError), /// Input did not adhere to the given encoding #[cfg(feature = "encoding")] - Other(&'static Encoding), + Other(&'static encoding_rs::Encoding), } impl From for EncodingError { #[inline] fn from(e: Utf8Error) -> Self { + Self::Utf8(e.into()) + } +} + +impl From for EncodingError { + #[inline] + fn from(e: Utf8ValidationError) -> Self { Self::Utf8(e) } } @@ -77,20 +138,22 @@ 
impl std::fmt::Display for EncodingError { #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Decoder { #[cfg(feature = "encoding")] - pub(crate) encoding: &'static Encoding, + pub(crate) encoding: &'static encoding_rs::Encoding, } impl Decoder { pub(crate) const fn utf8() -> Self { Decoder { #[cfg(feature = "encoding")] - encoding: UTF_8, + encoding: encoding_rs::UTF_8, } } #[cfg(all(test, feature = "encoding", feature = "serialize"))] pub(crate) const fn utf16() -> Self { - Decoder { encoding: UTF_16LE } + Decoder { + encoding: encoding_rs::UTF_16LE, + } } } @@ -101,7 +164,7 @@ impl Decoder { /// /// [`decode`]: Self::decode #[cfg(feature = "encoding")] - pub const fn encoding(&self) -> &'static Encoding { + pub const fn encoding(&self) -> &'static encoding_rs::Encoding { self.encoding } @@ -182,7 +245,7 @@ impl Decoder { #[cfg(feature = "encoding")] pub fn decode<'b>( bytes: &'b [u8], - encoding: &'static Encoding, + encoding: &'static encoding_rs::Encoding, ) -> Result, EncodingError> { encoding .decode_without_bom_handling_and_without_replacement(bytes) @@ -193,10 +256,10 @@ pub fn decode<'b>( #[cfg(feature = "encoding")] pub fn decode_into( bytes: &[u8], - encoding: &'static Encoding, + encoding: &'static encoding_rs::Encoding, buf: &mut String, ) -> Result<(), EncodingError> { - if encoding == UTF_8 { + if encoding == encoding_rs::UTF_8 { buf.push_str(std::str::from_utf8(bytes)?); return Ok(()); } @@ -211,22 +274,22 @@ pub fn decode_into( ); let (result, read) = decoder.decode_to_string_without_replacement(bytes, buf, true); match result { - DecoderResult::InputEmpty => { + encoding_rs::DecoderResult::InputEmpty => { debug_assert_eq!(read, bytes.len()); Ok(()) } - DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)), + encoding_rs::DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)), // SAFETY: We allocate enough space above - DecoderResult::OutputFull => unreachable!(), + encoding_rs::DecoderResult::OutputFull => 
unreachable!(), } } /// Automatic encoding detection of XML files based using the /// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). /// -/// If encoding is detected, `Some` is returned with an encoding and size of BOM -/// in bytes, if detection was performed using BOM, or zero, if detection was -/// performed without BOM. +/// If encoding is detected, `Some` is returned with a [`DetectedEncoding`], from which +/// can be derived the size of the BOM in bytes, if detection was performed using BOM +/// - or zero, if detection was performed without BOM. /// /// IF encoding was not recognized, `None` is returned. /// @@ -246,21 +309,1053 @@ pub fn decode_into( /// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) /// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) /// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably -#[cfg(feature = "encoding")] -pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> { +pub fn detect_encoding(bytes: &[u8]) -> Option { // Prevent suggesting " Some((UTF_16BE, 2)), - _ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)), - _ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)), + _ if bytes.starts_with(UTF16_BE_BOM) => Some(DetectedEncoding::Utf16BeBom), + _ if bytes.starts_with(UTF16_LE_BOM) => Some(DetectedEncoding::Utf16LeBom), + _ if bytes.starts_with(UTF8_BOM) => Some(DetectedEncoding::Utf8Bom), // without BOM - _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => 
Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(DetectedEncoding::Utf16Be), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(DetectedEncoding::Utf16Le), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => { + Some(DetectedEncoding::AsciiCompatible) + } // Some ASCII compatible _ => None, } } + +/// Possible scenarios for start-of-xml detection of encoding +/// +/// See the documentation of [`detect_encoding`] +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum DetectedEncoding { + /// Matches UTF-8 or some other ascii-compatible encoding + AsciiCompatible, + /// We saw a UTF-8 BOM + Utf8Bom, + /// Matches UTF-16-LE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2) + Utf16Le, + /// We saw a UTF-16 BOM in little-endian orientation + Utf16LeBom, + /// Matches UTF-16-BE or some other UTF-16 compatible encoding (e.g. 
ISO-10646-UCS-2) + Utf16Be, + /// We saw a UTF-16 BOM in big-endian orientation + Utf16BeBom, +} + +impl DetectedEncoding { + /// Return an Encoding object appropriate for the detected encoding + #[cfg(feature = "encoding")] + pub fn encoding(&self) -> &'static encoding_rs::Encoding { + match self { + DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf8Bom => encoding_rs::UTF_8, + DetectedEncoding::Utf16Le | DetectedEncoding::Utf16LeBom => encoding_rs::UTF_16LE, + DetectedEncoding::Utf16Be | DetectedEncoding::Utf16BeBom => encoding_rs::UTF_16BE, + } + } + + /// Length of the BOM, which may need to be stripped from the input + pub fn bom_len(&self) -> usize { + match self { + DetectedEncoding::Utf8Bom => 3, + DetectedEncoding::Utf16LeBom | DetectedEncoding::Utf16BeBom => 2, + DetectedEncoding::AsciiCompatible + | DetectedEncoding::Utf16Le + | DetectedEncoding::Utf16Be => 0, + } + } +} + +/// A struct for transparently decoding / validating bytes as UTF-8. +#[derive(Debug)] +pub struct Utf8BytesReader { + #[cfg(feature = "encoding")] + reader: io::BufReader, + #[cfg(not(feature = "encoding"))] + reader: io::BufReader>, +} + +// TODO: Utf8BytesReader should manage encoding detection and BOM stripping - that responsibility +// can then be removed from the readers, with perhaps an exception for slice reader. +impl Utf8BytesReader { + /// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8. + /// (TODO: well, not yet - right now it's just a dumb wrapper) + /// Note: The consumer is responsible for finding the correct character boundaries when + /// treating a given range of bytes as UTF-8. + #[cfg(feature = "encoding")] + pub fn new(reader: R) -> Self { + Self { + reader: io::BufReader::new(reader), + } + } + + /// Build a new reader which validates UTF-8. + /// Note: The consumer is responsible for finding the correct character boundaries when + /// treating a given range of bytes as UTF-8. 
+ #[cfg(not(feature = "encoding"))] + pub fn new(reader: R) -> Self { + Self { + reader: io::BufReader::new(Utf8ValidatingReader::new(reader)), + } + } +} + +impl io::Read for Utf8BytesReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.reader.read(buf) + } +} + +impl io::BufRead for Utf8BytesReader { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + self.reader.fill_buf() + } + + fn consume(&mut self, amt: usize) { + self.reader.consume(amt) + } +} + +/// A reader wrapper that ensures only valid UTF-8 bytes are read. +/// +/// This reader uses [`str::from_utf8()`] and [`Utf8Error::valid_up_to()`] to validate +/// that only valid UTF-8 bytes are written to the output buffer. Incomplete UTF-8 +/// sequences at read boundaries are buffered and combined with subsequent reads. +/// +/// Additionally, this reader checks the very beginning of the stream for encoding +/// signatures (BOMs or XML declaration patterns) and rejects streams that appear to +/// be encoded in UTF-16 or other non-UTF-8 encodings. +/// +/// # Examples +/// +/// ``` +/// use std::io::Read; +/// use quick_xml::encoding::Utf8ValidatingReader; +/// +/// let data = b"Hello, \xF0\x9F\x98\x80!"; // "Hello, πŸ˜€!" 
+/// let mut reader = Utf8ValidatingReader::new(&data[..]); +/// let mut buf = [0u8; 20]; +/// let n = reader.read(&mut buf).unwrap(); +/// assert_eq!(&buf[..n], data); +/// ``` +#[derive(Debug)] +pub struct Utf8ValidatingReader { + inner: R, + /// Buffer to hold incomplete UTF-8 sequences from previous reads (max 3 bytes) + buffer: Vec, + /// Whether we've checked for encoding at the start of the stream + encoding_checked: bool, +} + +impl Utf8ValidatingReader { + /// Creates a new UTF-8 validating reader + pub fn new(inner: R) -> Self { + Self { + inner, + buffer: Vec::with_capacity(4), + encoding_checked: false, + } + } + + /// Returns a reference to the underlying reader + pub fn get_ref(&self) -> &R { + &self.inner + } + + /// Returns a mutable reference to the underlying reader + pub fn get_mut(&mut self) -> &mut R { + &mut self.inner + } + + /// Consumes this reader and returns the underlying reader + pub fn into_inner(self) -> R { + self.inner + } +} + +impl Read for Utf8ValidatingReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if buf.is_empty() { + return Ok(0); + } + + // Check for encoding at the start of the stream + if !self.encoding_checked { + self.encoding_checked = true; + + // Read initial data to detect encoding + // Read enough for encoding detection (4 bytes) plus fill up to caller's buffer size + let read_size = buf.len().max(64); // Read at least 64 bytes for efficiency + let mut temp = vec![0u8; read_size]; + let n = self.inner.read(&mut temp)?; + + if n > 0 { + self.buffer.extend_from_slice(&temp[..n]); + + // Try to detect encoding if we have at least 4 bytes + if self.buffer.len() >= 4 { + if let Some(detected) = detect_encoding(&self.buffer) { + match detected { + DetectedEncoding::Utf8Bom | DetectedEncoding::AsciiCompatible => { + // Strip BOM if present + let bom_len = detected.bom_len(); + if bom_len > 0 { + self.buffer.drain(..bom_len); + } + } + DetectedEncoding::Utf16Le + | DetectedEncoding::Utf16LeBom + | 
DetectedEncoding::Utf16Be + | DetectedEncoding::Utf16BeBom => { + // Reject UTF-16 encodings + return Err(io::Error::new( + io::ErrorKind::InvalidData, + EncodingError::Utf8( + Utf8ValidationError::NonUtf8EncodingDetected(detected), + ), + )); + } + } + } + } + } + // If we read 0 bytes or less than 4 bytes, assume UTF-8 and continue + } + + loop { + // If we have buffered data, check if it's complete UTF-8 + if !self.buffer.is_empty() { + match std::str::from_utf8(&self.buffer) { + Ok(s) => { + // All buffered bytes are valid UTF-8 + // Find how many complete characters fit in the buffer + let mut bytes_to_copy = 0; + for (idx, _) in s.char_indices() { + if idx > buf.len() { + break; + } + bytes_to_copy = idx; + } + // Also consider the last character + if s.len() <= buf.len() { + bytes_to_copy = s.len(); + } + + if bytes_to_copy == 0 { + // Buffer too small for even one character + return Ok(0); + } + + buf[..bytes_to_copy].copy_from_slice(&self.buffer[..bytes_to_copy]); + self.buffer.drain(..bytes_to_copy); + return Ok(bytes_to_copy); + } + Err(e) => { + let valid_up_to = e.valid_up_to(); + + if let Some(error_len) = e.error_len() { + // Invalid UTF-8 sequence found + if valid_up_to == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { + error_len, + }), + )); + } + // Write valid portion before the error + let len = valid_up_to.min(buf.len()); + buf[..len].copy_from_slice(&self.buffer[..len]); + + // Remove only the valid bytes, leave invalid bytes to error on next read + self.buffer.drain(..valid_up_to); + return Ok(len); + } else { + // Incomplete UTF-8 sequence - need to read more + // But first, if we have valid bytes, return them + if valid_up_to > 0 { + let len = valid_up_to.min(buf.len()); + buf[..len].copy_from_slice(&self.buffer[..len]); + self.buffer.drain(..len); + return Ok(len); + } + // Otherwise fall through to read more data + } + } + } + } + + // Read more data from the 
underlying reader + let read_size = buf.len().max(64); // Read at least 64 bytes for efficiency + let mut temp = vec![0u8; read_size]; + let n = self.inner.read(&mut temp)?; + + // If we read nothing + if n == 0 { + if self.buffer.is_empty() { + // True EOF with no buffered data + return Ok(0); + } else { + // EOF with incomplete UTF-8 sequence + return Err(io::Error::new( + io::ErrorKind::InvalidData, + EncodingError::Utf8(Utf8ValidationError::IncompleteSequence), + )); + } + } + + // Add newly read data to buffer + self.buffer.extend_from_slice(&temp[..n]); + // Loop back to validate and potentially return data + } + } +} + +#[cfg(test)] +mod utf8_bytes_reader_tests { + use super::*; + use std::io::{BufRead, Read}; + + #[test] + fn basic_read() { + let data = b"Hello, World!"; + let mut reader = Utf8BytesReader::new(&data[..]); + let mut buf = [0u8; 20]; + let n = reader.read(&mut buf).unwrap(); + assert!(n > 0); + assert_eq!(&buf[..n], &data[..n]); + } + + #[test] + fn read_with_multibyte_chars() { + let data = "Hello, δΈ–η•Œ! πŸ˜€".as_bytes(); + let mut reader = Utf8BytesReader::new(&data[..]); + let mut result = Vec::new(); + reader.read_to_end(&mut result).unwrap(); + assert_eq!(result, data); + assert_eq!(std::str::from_utf8(&result).unwrap(), "Hello, δΈ–η•Œ! 
πŸ˜€"); + } + + #[test] + fn bufread_interface() { + let data = b"Line1\nLine2\nLine3"; + let mut reader = Utf8BytesReader::new(&data[..]); + + // Test fill_buf + let buf = reader.fill_buf().unwrap(); + assert!(!buf.is_empty()); + + // Test consume + let consumed = buf.len().min(5); + reader.consume(consumed); + + // Read remaining + let mut result = Vec::new(); + reader.read_to_end(&mut result).unwrap(); + assert_eq!(result, &data[consumed..]); + } + + #[test] + fn empty_input() { + let data = b""; + let mut reader = Utf8BytesReader::new(&data[..]); + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(n, 0); + } +} + +#[cfg(test)] +mod utf8_validating_reader_tests { + use super::*; + use std::io::{Cursor, Read}; + + /// Helper reader that returns data in fixed-size chunks + struct ChunkedReader<'a> { + data: &'a [u8], + pos: usize, + chunk_size: usize, + } + + impl<'a> ChunkedReader<'a> { + fn new(data: &'a [u8], chunk_size: usize) -> Self { + Self { + data, + pos: 0, + chunk_size, + } + } + } + + impl<'a> Read for ChunkedReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos >= self.data.len() { + return Ok(0); + } + let len = self + .chunk_size + .min(buf.len()) + .min(self.data.len() - self.pos); + buf[..len].copy_from_slice(&self.data[self.pos..self.pos + len]); + self.pos += len; + Ok(len) + } + } + + mod basic_access { + use super::*; + + #[test] + fn test_get_ref() { + let data = b"Hello"; + let cursor = Cursor::new(data.to_vec()); + let reader = Utf8ValidatingReader::new(cursor); + + assert_eq!(reader.get_ref().get_ref(), data); + } + + #[test] + fn test_get_mut() { + let data = b"Hello"; + let cursor = Cursor::new(data.to_vec()); + let mut reader = Utf8ValidatingReader::new(cursor); + + reader.get_mut().set_position(2); + assert_eq!(reader.get_ref().position(), 2); + } + + #[test] + fn test_into_inner() { + let data = b"Hello"; + let cursor = Cursor::new(data.to_vec()); + let reader = 
Utf8ValidatingReader::new(cursor); + + let inner = reader.into_inner(); + assert_eq!(inner.get_ref(), data); + } + } + + mod valid_utf8 { + use super::*; + + #[test] + fn valid_ascii() { + let data = b"Hello, World!"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 20]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(n, 13); + assert_eq!(&buf[..n], data); + } + + #[test] + fn valid_multibyte_characters() { + // Mix of 1, 2, 3, and 4 byte UTF-8 sequences + let data = "Hello, £€ δΈ–η•Œ! πŸ˜€".as_bytes(); // ASCII + 2x2-byte + ASCII + 2x3-byte + ASCII + 4-byte + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = vec![0u8; 100]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], data); + } + + #[test] + fn empty_input() { + let data = b""; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(n, 0); + } + + #[test] + fn empty_buffer() { + let data = b"Hello, World!"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + + // Read with empty buffer - should return 0 without affecting state + let mut empty_buf = []; + let n = reader.read(&mut empty_buf).unwrap(); + assert_eq!(n, 0); + + // Read with actual buffer - should get data + let mut buf = [0u8; 5]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(n, 5); + assert_eq!(&buf[..n], b"Hello"); + + // Read with empty buffer again - should return 0 without affecting state + let n = reader.read(&mut empty_buf).unwrap(); + assert_eq!(n, 0); + + // Read remaining data - should continue from where we left off + let mut buf2 = [0u8; 20]; + let n = reader.read(&mut buf2).unwrap(); + assert_eq!(&buf2[..n], b", World!"); + } + + #[test] + fn two_byte_char_boundary() { + // Β£ is 0xC2 0xA3 in UTF-8 + let data = b"Hi\xC2\xA3"; + + let mut reader = Utf8ValidatingReader::new(ChunkedReader::new(data, 1)); + let mut result = Vec::new(); + + loop { + let mut buf = [0u8; 
10]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + } + + #[test] + fn three_byte_char_boundary() { + // δΈ– is 0xE4 0xB8 0x96 in UTF-8 + let data = "HiδΈ–".as_bytes(); + + let mut reader = Utf8ValidatingReader::new(ChunkedReader::new(data, 1)); + let mut result = Vec::new(); + + loop { + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + } + + #[test] + fn four_byte_char_boundary() { + // πŸ˜€ is 0xF0 0x9F 0x98 0x80 in UTF-8 + let data = "HiπŸ˜€".as_bytes(); + + let mut reader = Utf8ValidatingReader::new(ChunkedReader::new(data, 1)); + let mut result = Vec::new(); + + loop { + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + } + + #[test] + fn consecutive_valid_multibyte() { + // Multiple 2-byte chars in a row + let data = "£€Β₯".as_bytes(); + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 20]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], data); + assert_eq!(std::str::from_utf8(&buf[..n]).unwrap(), "£€Β₯"); + } + + #[test] + fn read_exactly_at_char_boundary() { + let data = "HiδΈ–".as_bytes(); // 2 ASCII + 3-byte char = 5 bytes + let mut reader = Utf8ValidatingReader::new(&data[..]); + + // Read exactly the size + let mut buf = [0u8; 5]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(n, 5); + assert_eq!(&buf[..n], data); + } + + #[test] + fn multiple_multibyte_chars() { + let data = "δΈ–η•ŒπŸ˜€πŸŽ‰".as_bytes(); + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = vec![0u8; 100]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], data); + assert_eq!(std::str::from_utf8(&buf[..n]).unwrap(), "δΈ–η•ŒπŸ˜€πŸŽ‰"); + } + + #[test] + fn 
partial_read_with_buffering() { + // Create data where multibyte char is at boundary + let data = "abπŸ˜€cd".as_bytes(); // a, b, [4-byte emoji], c, d + + // Read 3 bytes at a time - will split the 4-byte emoji + let mut reader = Utf8ValidatingReader::new(ChunkedReader::new(data, 3)); + + let mut result = Vec::new(); + loop { + let mut buf = [0u8; 20]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + assert_eq!(std::str::from_utf8(&result).unwrap(), "abπŸ˜€cd"); + } + } + + mod invalid_utf8 { + use super::*; + + #[test] + fn incomplete_sequence_at_eof() { + // Incomplete 2-byte sequence at end + let data = b"Hi\xC2"; // Missing second byte of Β£ + let mut reader = Utf8ValidatingReader::new(&data[..]); + + let mut buf = [0u8; 10]; + let n1 = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n1], b"Hi"); + + // Second read should fail because incomplete sequence at EOF + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's the IncompleteSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {} + other => panic!("Expected IncompleteSequence error, got: {:?}", other), + } + } + + #[test] + fn invalid_utf8_start_byte() { + // 0xFF is never valid in UTF-8 + let data = b"\xFF"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + 
.expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)"); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn invalid_utf8_continuation() { + // 0xC2 should be followed by 0x80-0xBF, not 0x00 + let data = b"\xC2\x00"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence"); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn invalid_utf8_with_valid_prefix() { + // Valid UTF-8 followed by invalid + let data = b"OK\xFF"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], b"OK"); + + // Second read should error on invalid byte + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + 
assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)"); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn multiple_reads() { + let data = "Hello, δΈ–η•Œ! πŸ˜€ Test".as_bytes(); + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut result = Vec::new(); + + // Read in small chunks + loop { + let mut buf = [0u8; 5]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + } + + #[test] + fn very_small_buffer() { + let data = "πŸ˜€".as_bytes(); // 4 bytes + let mut reader = Utf8ValidatingReader::new(&data[..]); + + // Buffer smaller than character + let mut buf = [0u8; 2]; + let n1 = reader.read(&mut buf).unwrap(); + + // Should buffer the incomplete sequence + assert_eq!(n1, 0); + + // Larger buffer should get the character + let mut buf2 = [0u8; 10]; + let n2 = reader.read(&mut buf2).unwrap(); + assert_eq!(&buf2[..n2], data); + } + + #[test] + fn split_4byte_char_across_multiple_reads() { + // πŸ˜€ is 0xF0 0x9F 0x98 0x80 + let data = b"\xF0\x9F\x98\x80"; + + let mut reader = Utf8ValidatingReader::new(ChunkedReader::new(data, 2)); + let mut result = Vec::new(); + + loop { + let mut buf = [0u8; 10]; + let n = reader.read(&mut buf).unwrap(); + if n == 0 { + break; + } + result.extend_from_slice(&buf[..n]); + } + + assert_eq!(result, data); + } + + #[test] + fn mixed_valid_and_invalid() { + // Valid, invalid, valid - but we error on invalid so never see "More" + let data = b"OK\xFFMore"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + + let mut buf = [0u8; 20]; + + // First read gets "OK" + let n1 = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n1], b"OK"); + + // Second read should error on invalid byte (never reaches "More") + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify 
the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)"); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn all_invalid_bytes() { + let data = b"\xFF\xFE\xFD"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's the expected error variant + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!(*error_len, 1, "Expected 1-byte invalid sequence (0xFF)"); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn incomplete_3byte_at_eof() { + // Incomplete 3-byte sequence + let data = b"Hi\xE4\xB8"; // Missing third byte of δΈ– + let mut reader = Utf8ValidatingReader::new(&data[..]); + + let mut buf = [0u8; 10]; + let n1 = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n1], b"Hi"); + + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's the IncompleteSequence error + match encoding_err { + 
EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {} + other => panic!("Expected IncompleteSequence error, got: {:?}", other), + } + } + + #[test] + fn incomplete_4byte_at_eof() { + // Incomplete 4-byte sequence + let data = b"Hi\xF0\x9F\x98"; // Missing fourth byte of πŸ˜€ + let mut reader = Utf8ValidatingReader::new(&data[..]); + + let mut buf = [0u8; 10]; + let n1 = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n1], b"Hi"); + + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's the IncompleteSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::IncompleteSequence) => {} + other => panic!("Expected IncompleteSequence error, got: {:?}", other), + } + } + + #[test] + fn overlong_encoding() { + // Overlong encoding of '/' (0x2F) + // Valid: 0x2F + // Overlong 2-byte: 0xC0 0xAF (invalid) + let data = b"\xC0\xAF"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!( + *error_len, 1, + "Expected 1-byte invalid sequence (0xC0 is invalid)" + ); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + + #[test] + fn surrogate_pairs() { + // UTF-16 surrogate pairs are invalid in UTF-8 + // 
0xED 0xA0 0x80 (U+D800, invalid surrogate) + let data = b"\xED\xA0\x80"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 10]; + + let result = reader.read(&mut buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's an InvalidSequence error + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::InvalidSequence { error_len }) => { + assert_eq!( + *error_len, 1, + "Expected 1-byte invalid sequence (0xED starts surrogate)" + ); + } + other => panic!("Expected InvalidSequence error, got: {:?}", other), + } + } + } + + mod encoding_detection { + use super::*; + + #[test] + fn utf8_bom_stripped() { + // UTF-8 BOM (0xEF 0xBB 0xBF) followed by "Hello" + let data = b"\xEF\xBB\xBFHello"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 20]; + let n = reader.read(&mut buf).unwrap(); + + // BOM should be stripped, only "Hello" should be returned + assert_eq!(&buf[..n], b"Hello"); + assert_eq!(std::str::from_utf8(&buf[..n]).unwrap(), "Hello"); + } + + #[test] + fn utf16le_bom_rejected() { + // UTF-16 LE BOM (0xFF 0xFE) + let data = b"\xFF\xFE() + .expect("Error should downcast to EncodingError"); + + // Verify it's the NonUtf8EncodingDetected error with the correct encoding + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => { + assert_eq!(*detected, DetectedEncoding::Utf16LeBom); + } + other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other), + } + } + + #[test] + fn utf16be_bom_rejected() { + // UTF-16 BE BOM (0xFE 0xFF) + let data = b"\xFE\xFF\x00<\x00?"; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 20]; + + let result = reader.read(&mut 
buf); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::InvalidData); + + // Verify the error can be downcast to EncodingError + let encoding_err = err + .get_ref() + .unwrap() + .downcast_ref::() + .expect("Error should downcast to EncodingError"); + + // Verify it's the NonUtf8EncodingDetected error with the correct encoding + match encoding_err { + EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => { + assert_eq!(*detected, DetectedEncoding::Utf16BeBom); + } + other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other), + } + } + + #[test] + fn ascii_compatible_encoding_allowed() { + // ASCII-compatible XML declaration (no BOM) + let data = b""; + let mut reader = Utf8ValidatingReader::new(&data[..]); + let mut buf = [0u8; 50]; + + let n = reader.read(&mut buf).unwrap(); + assert_eq!(&buf[..n], data); + } + } +} diff --git a/src/errors.rs b/src/errors.rs index 9002f047..7cd970be 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -5,7 +5,7 @@ use crate::escape::EscapeError; use crate::events::attributes::AttrError; use crate::name::{NamespaceError, QName}; use std::fmt; -use std::io::Error as IoError; +use std::io::{Error as IoError, ErrorKind as IoErrorKind}; use std::sync::Arc; /// An error returned if parsed document does not correspond to the XML grammar, @@ -221,7 +221,12 @@ impl From for Error { /// Creates a new `Error::Io` from the given error #[inline] fn from(error: IoError) -> Error { - Self::Io(Arc::new(error)) + match error.kind() { + IoErrorKind::InvalidData => Self::Encoding(error.downcast::().expect( + "Got an IoError::InvalidData, but it wasn't downcastable to EncodingError?", + )), + _ => Self::Io(Arc::new(error)), + } } } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 9798d846..d8e871af 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -151,10 +151,10 @@ impl<'a> Attribute<'a> { use std::str::from_utf8; let 
decoded = match &self.value { - Cow::Borrowed(bytes) => Cow::Borrowed(from_utf8(bytes).map_err(EncodingError::Utf8)?), + Cow::Borrowed(bytes) => Cow::Borrowed(from_utf8(bytes).map_err(EncodingError::from)?), // Convert to owned, because otherwise Cow will be bound with wrong lifetime Cow::Owned(bytes) => { - Cow::Owned(from_utf8(bytes).map_err(EncodingError::Utf8)?.to_owned()) + Cow::Owned(from_utf8(bytes).map_err(EncodingError::from)?.to_owned()) } }; diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 62b97fbd..756077a0 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -5,8 +5,7 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf}; - +use crate::encoding; use crate::errors::{Error, IllFormedError, Result, SyntaxError}; use crate::events::{BytesRef, BytesText, Event}; use crate::name::{QName, ResolveResult}; @@ -16,6 +15,7 @@ use crate::reader::{ BangType, BinaryStream, NsReader, ParseState, ReadRefResult, ReadTextResult, Reader, Span, }; use crate::utils::is_whitespace; +use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf}; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. /// diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index ee418eb5..59aca686 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,6 +5,7 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; +use crate::encoding::{self, Utf8BytesReader}; use crate::errors::{Error, Result}; use crate::events::{BytesText, Event}; use crate::name::QName; @@ -17,13 +18,11 @@ macro_rules! impl_buffered_source { #[cfg(not(feature = "encoding"))] #[inline] $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> { - use crate::encoding::UTF8_BOM; - loop { break match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) => { - if n.starts_with(UTF8_BOM) { - self $(.$reader)? 
.consume(UTF8_BOM.len()); + if n.starts_with(encoding::UTF8_BOM) { + self $(.$reader)? .consume(encoding::UTF8_BOM.len()); } Ok(()) }, @@ -35,12 +34,12 @@ macro_rules! impl_buffered_source { #[cfg(feature = "encoding")] #[inline] - $($async)? fn detect_encoding(&mut self) -> io::Result> { + $($async)? fn detect_encoding(&mut self) -> io::Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) { - self $(.$reader)? .consume(bom_len); - Ok(Some(enc)) + Ok(n) => if let Some(detected) = encoding::detect_encoding(n) { + self $(.$reader)? .consume(detected.bom_len()); + Ok(Some(detected)) } else { Ok(None) }, @@ -579,6 +578,18 @@ impl Reader> { } } +impl Reader> { + /// Creates an XML reader from a file path. + /// + /// The reader will validate that all bytes read from the file are valid UTF-8. + /// If invalid UTF-8 is encountered, an error will be returned when reading events. + #[cfg(not(feature = "encoding"))] + pub fn from_file_validating>(path: P) -> Result { + let file = File::open(path)?; + Ok(Self::from_reader_validating(file)) + } +} + #[cfg(test)] mod test { use crate::reader::test::check; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index b8a569b2..da31a22e 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -5,7 +5,9 @@ use encoding_rs::Encoding; use std::io; use std::ops::Range; -use crate::encoding::Decoder; +#[cfg(feature = "encoding")] +use crate::encoding::DetectedEncoding; +use crate::encoding::{Decoder, Utf8BytesReader}; use crate::errors::{Error, IllFormedError, SyntaxError}; use crate::events::{BytesRef, Event}; use crate::parser::{DtdParser, ElementParser, Parser, PiParser}; @@ -267,7 +269,7 @@ macro_rules! read_event_impl { #[cfg(feature = "encoding")] if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? 
{ if $self.state.encoding.can_be_refined() { - $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding); + $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding.encoding()); } } @@ -776,7 +778,7 @@ impl Reader { /// Creates a `Reader` that reads from a given reader. pub fn from_reader(reader: R) -> Self { Self { - reader, + reader: reader, state: ReaderState::default(), } } @@ -792,6 +794,35 @@ impl Reader { } } +impl Reader> { + /// Creates a `Reader` that reads from a given reader with UTF-8 validation. + /// + /// This constructor wraps the input reader in a [`Utf8BytesReader`], which validates + /// that all bytes read are valid UTF-8. If invalid UTF-8 is encountered, an error + /// will be returned when reading events. + /// + /// # Experimental + /// + /// **This API is experimental and may change (or disappear) in future versions.** + /// + /// # Examples + /// + /// ``` + /// use quick_xml::reader::Reader; + /// use std::io::Cursor; + /// + /// let data = Cursor::new(b"content".to_vec()); + /// let reader = Reader::from_reader_validating(data); + /// // Reader will validate UTF-8 as it reads + /// ``` + /// + /// [`Utf8BytesReader`]: crate::encoding::Utf8BytesReader + #[cfg(not(feature = "encoding"))] + pub fn from_reader_validating(reader: R) -> Self { + Self::from_reader(Utf8BytesReader::new(reader)) + } +} + /// Getters impl Reader { /// Consumes `Reader` returning the underlying reader @@ -1057,7 +1088,7 @@ trait XmlSource<'r, B> { /// Determines encoding from the start of input and removes BOM if it is present #[cfg(feature = "encoding")] - fn detect_encoding(&mut self) -> io::Result>; + fn detect_encoding(&mut self) -> io::Result>; /// Read input until start of markup (the `<`) is found, start of general entity /// reference (the `&`) is found or end of input is reached. 
diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 46858cc8..4b484b92 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -9,6 +9,7 @@ use std::io::{BufRead, BufReader}; use std::ops::Deref; use std::path::Path; +use crate::encoding::Utf8BytesReader; use crate::errors::Result; use crate::events::{BytesText, Event}; use crate::name::{NamespaceResolver, QName, ResolveResult}; @@ -50,6 +51,36 @@ impl NsReader { } } +impl NsReader> { + /// Creates an `NsReader` that reads from a given reader with UTF-8 validation. + /// + /// This constructor wraps the input reader in a [`Utf8BytesReader`], which validates + /// that all bytes read are valid UTF-8. If invalid UTF-8 is encountered, an error + /// will be returned when reading events. + /// + /// # Experimental + /// + /// **This API is experimental and may change (or disappear) in future versions.** + /// + /// # Examples + /// + /// ``` + /// use quick_xml::reader::NsReader; + /// use std::io::Cursor; + /// + /// let data = Cursor::new(b"content".to_vec()); + /// let reader = NsReader::from_reader_validating(data); + /// // Reader will validate UTF-8 as it reads + /// ``` + /// + /// [`Utf8BytesReader`]: crate::encoding::Utf8BytesReader + #[inline] + #[cfg(not(feature = "encoding"))] + pub fn from_reader_validating(reader: R) -> Self { + Self::new(Reader::from_reader_validating(reader)) + } +} + /// Private methods impl NsReader { #[inline] @@ -442,6 +473,18 @@ impl NsReader> { } } +impl NsReader> { + /// Creates an XML reader from a file path. + /// + /// If the [`encoding`] feature is *not* enabled, the reader will validate that all + /// bytes read from the file are valid UTF-8. If invalid UTF-8 is encountered, an + /// error will be returned when reading events. 
+ #[cfg(not(feature = "encoding"))] + pub fn from_file_validating>(path: P) -> Result { + Ok(Self::new(Reader::from_file_validating(path)?)) + } +} + impl<'i> NsReader<&'i [u8]> { /// Creates an XML reader from a string slice. #[inline] diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 65ac2796..e744794a 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -4,10 +4,12 @@ use std::io; +#[cfg(feature = "encoding")] +use crate::encoding::DetectedEncoding; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_8}; +use encoding_rs; use crate::errors::{Error, Result}; use crate::events::{BytesText, Event}; @@ -27,7 +29,7 @@ impl<'a> Reader<&'a [u8]> { #[cfg(feature = "encoding")] { let mut reader = Self::from_reader(s.as_bytes()); - reader.state.encoding = EncodingRef::Explicit(UTF_8); + reader.state.encoding = EncodingRef::Explicit(encoding_rs::UTF_8); reader } @@ -253,10 +255,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { #[cfg(feature = "encoding")] #[inline] - fn detect_encoding(&mut self) -> io::Result> { - if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { - *self = &self[bom_len..]; - return Ok(Some(enc)); + fn detect_encoding(&mut self) -> io::Result> { + if let Some(detected) = crate::encoding::detect_encoding(self) { + *self = &self[detected.bom_len() as usize..]; + return Ok(Some(detected)); } Ok(None) } diff --git a/src/reader/state.rs b/src/reader/state.rs index a0ee3b96..d1ab55ef 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -280,6 +280,8 @@ impl ReaderState { if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder())); + // TODO: once we can assume that the parser is operating on UTF-8, then we can throw + // an error here if we see a non-UTF-8 encoding... if encoding/decoding is not enabled. 
// Try getting encoding from the declaration event #[cfg(feature = "encoding")] if self.encoding.can_be_refined() { diff --git a/tests/encodings.rs b/tests/encodings.rs index b906f3b6..79c8a512 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -1,8 +1,10 @@ +#[cfg(feature = "encoding")] use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1251}; -use pretty_assertions::assert_eq; +#[cfg(feature = "encoding")] use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event::*}; use quick_xml::reader::Reader; +#[cfg(feature = "encoding")] mod decode { use super::*; use pretty_assertions::assert_eq; @@ -20,33 +22,218 @@ mod decode { #[test] fn test_detect_encoding() { // No BOM - assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0))); + let detected = detect_encoding(UTF8_TEXT.as_bytes()).unwrap(); + assert_eq!(detected.encoding(), UTF_8); + assert_eq!(detected.bom_len(), 0); + // BOM - assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3))); - assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2))); - assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2))); + let detected = detect_encoding(UTF8_TEXT_WITH_BOM).unwrap(); + assert_eq!(detected.encoding(), UTF_8); + assert_eq!(detected.bom_len(), 3); + + let detected = detect_encoding(UTF16BE_TEXT_WITH_BOM).unwrap(); + assert_eq!(detected.encoding(), UTF_16BE); + assert_eq!(detected.bom_len(), 2); + + let detected = detect_encoding(UTF16LE_TEXT_WITH_BOM).unwrap(); + assert_eq!(detected.encoding(), UTF_16LE); + assert_eq!(detected.bom_len(), 2); + } + + #[test] + fn koi8_r_encoding() { + let src = include_bytes!("documents/opennews_all.rss").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader(src); + r.config_mut().trim_text(true); + loop { + match r.read_event_into(&mut buf) { + Ok(Text(e)) => { + e.xml10_content().unwrap(); + } + Ok(Eof) => break, + _ => (), + } + } } } -#[test] -fn test_koi8_r_encoding() { - let src = 
include_bytes!("documents/opennews_all.rss").as_ref(); - let mut buf = vec![]; - let mut r = Reader::from_reader(src); - r.config_mut().trim_text(true); - loop { - match r.read_event_into(&mut buf) { - Ok(Text(e)) => { - e.xml10_content().unwrap(); +#[cfg(not(feature = "encoding"))] +mod validate { + use super::*; + use quick_xml::encoding::{EncodingError, Utf8ValidationError}; + use quick_xml::errors::Error; + + #[test] + fn validation_fails_on_utf16le_input_with_bom() { + let src = include_bytes!("documents/encoding/utf16le-bom.xml").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader_validating(src); + r.config_mut().trim_text(true); + + let result = loop { + match r.read_event_into(&mut buf) { + Ok(_) => panic!("Expected encoding error, didn't get one"), + Err(e) => break e, + } + }; + + // Assert that we got the specific error type + match result { + Error::Encoding(EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected( + detected, + ))) => { + use quick_xml::encoding::DetectedEncoding; + assert!( + matches!(detected, DetectedEncoding::Utf16LeBom), + "Expected UTF-16 LE BOM detection, got: {:?}", + detected + ); } - Ok(Eof) => break, - _ => (), + other => panic!( + "Expected EncodingError::Utf8(NonUtf8EncodingDetected), got: {:?}", + other + ), + } + } + + #[test] + fn validation_fails_on_utf16be_input_with_bom() { + let src = include_bytes!("documents/encoding/utf16be-bom.xml").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader_validating(src); + r.config_mut().trim_text(true); + + let result = loop { + match r.read_event_into(&mut buf) { + Ok(_) => panic!("Expected encoding error, didn't get one"), + Err(e) => break e, + } + }; + + // Assert that we got the specific error type + match result { + Error::Encoding(EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected( + detected, + ))) => { + use quick_xml::encoding::DetectedEncoding; + assert!( + matches!(detected, DetectedEncoding::Utf16BeBom), + 
"Expected UTF-16 BE BOM detection, got: {:?}", + detected + ); + } + other => panic!( + "Expected EncodingError::Utf8(NonUtf8EncodingDetected), got: {:?}", + other + ), + } + } + + #[test] + fn validation_fails_on_utf16le_input_without_bom() { + let src = include_bytes!("documents/encoding/utf16le.xml").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader_validating(src); + r.config_mut().trim_text(true); + + let result = loop { + match r.read_event_into(&mut buf) { + Ok(_) => panic!("Expected encoding error, didn't get one"), + Err(e) => break e, + } + }; + + // Assert that we got the specific error type + match result { + Error::Encoding(EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected( + detected, + ))) => { + use quick_xml::encoding::DetectedEncoding; + assert!( + matches!(detected, DetectedEncoding::Utf16Le), + "Expected UTF-16 LE pattern detection, got: {:?}", + detected + ); + } + other => panic!( + "Expected EncodingError::Utf8(NonUtf8EncodingDetected), got: {:?}", + other + ), + } + } + + #[test] + fn validation_fails_on_utf16be_input_without_bom() { + let src = include_bytes!("documents/encoding/utf16be.xml").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader_validating(src); + r.config_mut().trim_text(true); + + let result = loop { + match r.read_event_into(&mut buf) { + Ok(_) => panic!("Expected encoding error, didn't get one"), + Err(e) => break e, + } + }; + + // Assert that we got the specific error type + match result { + Error::Encoding(EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected( + detected, + ))) => { + use quick_xml::encoding::DetectedEncoding; + assert!( + matches!(detected, DetectedEncoding::Utf16Be), + "Expected UTF-16 BE pattern detection, got: {:?}", + detected + ); + } + other => panic!( + "Expected EncodingError::Utf8(NonUtf8EncodingDetected), got: {:?}", + other + ), + } + } + + #[test] + #[ignore = "Validating Reader cannot yet assume that a non-UTF-8 encoding in the 
Decl = problem"] + fn validation_fails_on_koi9r_input() { + let src = include_bytes!("documents/encoding/KOI8-R.xml").as_ref(); + let mut buf = vec![]; + let mut r = Reader::from_reader_validating(src); + r.config_mut().trim_text(true); + + let result = loop { + match r.read_event_into(&mut buf) { + Ok(_) => panic!("Expected encoding error, didn't get one"), + Err(e) => break e, + } + }; + + // Assert that we got the specific error type + match result { + Error::Encoding(EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected( + detected, + ))) => { + use quick_xml::encoding::DetectedEncoding; + assert!( + matches!(detected, DetectedEncoding::Utf16Be), + "Expected UTF-16 BE pattern detection, got: {:?}", + detected + ); + } + other => panic!( + "Expected EncodingError::Utf8(NonUtf8EncodingDetected), got: {:?}", + other + ), } } } /// Test data generated by helper project `test-gen`, which requires checkout of /// an `encoding` submodule +#[cfg(feature = "encoding")] mod detect { use super::*; use encoding_rs::*; @@ -208,8 +395,57 @@ mod detect { check_detection!(windows_1258, WINDOWS_1258, "windows-1258"); check_detection!(x_mac_cyrillic, X_MAC_CYRILLIC, "x-mac-cyrillic"); check_detection!(x_user_defined, X_USER_DEFINED, "x-user-defined"); + + /// Checks that encoding is detected by BOM and changed after XML declaration + /// BOM indicates UTF-16LE, but XML - windows-1251 + #[cfg(feature = "encoding")] + #[test] + fn bom_overridden_by_declaration() { + let mut reader = Reader::from_reader(b"\xFF\xFE".as_ref()); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); + assert_eq!(reader.decoder().encoding(), WINDOWS_1251); + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + /// Checks that encoding is changed by XML declaration, but only once + #[cfg(feature = "encoding")] + #[test] + fn only_one_declaration_changes_encoding() { + let mut 
reader = Reader::from_reader( + b"".as_ref(), + ); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); + assert_eq!(reader.decoder().encoding(), UTF_16LE); + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + /// Checks that XML declaration cannot change the encoding from UTF-8 if + /// a `Reader` was created using `from_str` method + #[cfg(feature = "encoding")] + #[test] + fn str_always_has_utf8() { + let mut reader = Reader::from_str(""); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event().unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_8); + + assert_eq!(reader.read_event().unwrap(), Eof); + } } +#[cfg(feature = "encoding")] #[test] fn bom_removed_from_initial_text() { let mut r = @@ -227,47 +463,3 @@ fn bom_removed_from_initial_text() { assert_eq!(r.read_event().unwrap(), End(BytesEnd::new("paired"))); assert_eq!(r.read_event().unwrap(), Eof); } - -/// Checks that encoding is detected by BOM and changed after XML declaration -/// BOM indicates UTF-16LE, but XML - windows-1251 -#[test] -fn bom_overridden_by_declaration() { - let mut reader = Reader::from_reader(b"\xFF\xFE".as_ref()); - let mut buf = Vec::new(); - - assert_eq!(reader.decoder().encoding(), UTF_8); - assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); - assert_eq!(reader.decoder().encoding(), WINDOWS_1251); - - assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); -} - -/// Checks that encoding is changed by XML declaration, but only once -#[test] -fn only_one_declaration_changes_encoding() { - let mut reader = - Reader::from_reader(b"".as_ref()); - let mut buf = Vec::new(); - - assert_eq!(reader.decoder().encoding(), UTF_8); - assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); - 
assert_eq!(reader.decoder().encoding(), UTF_16LE); - - assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Decl(_))); - assert_eq!(reader.decoder().encoding(), UTF_16LE); - - assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); -} - -/// Checks that XML declaration cannot change the encoding from UTF-8 if -/// a `Reader` was created using `from_str` method -#[test] -fn str_always_has_utf8() { - let mut reader = Reader::from_str(""); - - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event().unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_8); - - assert_eq!(reader.read_event().unwrap(), Eof); -}