From b3f52c3cefa6e21e7534bd6c90f9aafe1296fdd0 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 20 Jan 2026 00:21:49 +0500 Subject: [PATCH 1/4] Use correct method to get content of events in benchmarks and tests In all places XML documents has version 1.0, so explicitly request content in that version --- benches/macrobenches.rs | 8 ++++---- benches/microbenches.rs | 2 +- tests/encodings.rs | 2 +- tests/fuzzing.rs | 2 +- tests/reader.rs | 2 +- tests/roundtrip.rs | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 77197b3e..7e98628b 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -59,7 +59,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { } } Event::Text(e) => { - black_box(e.xml_content()?); + black_box(e.xml10_content()?); } Event::CData(e) => { black_box(e.into_inner()); @@ -84,7 +84,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { } } Event::Text(e) => { - black_box(e.xml_content()?); + black_box(e.xml10_content()?); } Event::CData(e) => { black_box(e.into_inner()); @@ -110,7 +110,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - black_box(e.xml_content()?); + black_box(e.xml10_content()?); black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { @@ -138,7 +138,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - black_box(e.xml_content()?); + black_box(e.xml10_content()?); black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index f6af550f..97e67e2b 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -150,7 +150,7 @@ fn one_event(c: &mut Criterion) { config.trim_text(true); config.check_end_names = false; match r.read_event() { - Ok(Event::Comment(e)) => nbtxt += e.xml_content().unwrap().len(), + 
Ok(Event::Comment(e)) => nbtxt += e.xml10_content().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/tests/encodings.rs b/tests/encodings.rs index 30f132b3..b906f3b6 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -37,7 +37,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event_into(&mut buf) { Ok(Text(e)) => { - e.xml_content().unwrap(); + e.xml10_content().unwrap(); } Ok(Eof) => break, _ => (), diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index 5d9ba047..90c72f18 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -38,7 +38,7 @@ fn fuzz_101() { } } Ok(Event::Text(e)) => { - if e.xml_content().is_err() { + if e.xml10_content().is_err() { break; } } diff --git a/tests/reader.rs b/tests/reader.rs index 15ce8eaf..41897f3d 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -172,7 +172,7 @@ fn test_escaped_content() { "content unexpected: expecting 'test', got '{:?}'", from_utf8(&e) ); - match e.xml_content() { + match e.xml10_content() { Ok(c) => assert_eq!(c, "test"), Err(e) => panic!( "cannot escape content at position {}: {:?}", diff --git a/tests/roundtrip.rs b/tests/roundtrip.rs index c99e6f64..b72e7d4d 100644 --- a/tests/roundtrip.rs +++ b/tests/roundtrip.rs @@ -236,7 +236,7 @@ fn reescape_text() { match reader.read_event().unwrap() { Eof => break, Text(e) => { - let t = e.xml_content().unwrap(); + let t = e.xml10_content().unwrap(); assert!(writer.write_event(Text(BytesText::new(&t))).is_ok()); } e => assert!(writer.write_event(e).is_ok()), From f12a6525322606ba00355c688238d7110f58b44d Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 20 Jan 2026 00:00:08 +0500 Subject: [PATCH 2/4] Add XmlVersion enum and xml_version method to the BytesDecl --- Changelog.md | 7 ++++- src/errors.rs | 5 ++++ src/events/mod.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 19 ++++++++++++++ 4 files changed, 96 insertions(+), 1 deletion(-) diff --git a/Changelog.md b/Changelog.md index 
63110b82..1c974ae5 100644 --- a/Changelog.md +++ b/Changelog.md @@ -16,10 +16,15 @@ ### New Features +- [#938]: Add new enumeration `XmlVersion` and typed getter `BytesDecl::xml_version()`. +- [#938]: Add new error variant `IllFormedError::UnknownVersion`. + ### Bug Fixes ### Misc Changes +[#938]: https://github.com/tafia/quick-xml/pull/938 + ## 0.39.2 -- 2026-02-20 @@ -41,7 +46,7 @@ ### New Features -- [#598]: Add method `NamespaceResolver::set_level` which may be helpful in som circumstances. +- [#598]: Add method `NamespaceResolver::set_level` which may be helpful in some circumstances. ### Bug Fixes diff --git a/src/errors.rs b/src/errors.rs index 9a529ca6..9002f047 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -103,6 +103,8 @@ pub enum IllFormedError { /// /// [specification]: https://www.w3.org/TR/xml11/#sec-prolog-dtd MissingDeclVersion(Option), + /// The XML version specified in the declaration is neither 1.0 nor 1.1. + UnknownVersion, /// A document type definition (DTD) does not contain a name of a root element. /// /// According to the [specification], document type definition (``) @@ -152,6 +154,9 @@ impl fmt::Display for IllFormedError { Self::MissingDeclVersion(Some(attr)) => { write!(f, "an XML declaration must start with `version` attribute, but in starts with `{}`", attr) } + Self::UnknownVersion => { + f.write_str("unknown XML version: either 1.0 or 1.1 is expected") + } Self::MissingDoctypeName => { f.write_str("`` declaration does not contain a name of a document type") } diff --git a/src/events/mod.rs b/src/events/mod.rs index 2953e412..2e6fddc3 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -54,6 +54,7 @@ use crate::escape::{ }; use crate::name::{LocalName, QName}; use crate::utils::{self, name_len, trim_xml_end, trim_xml_start, write_cow_string}; +use crate::XmlVersion; use attributes::{AttrError, Attribute, Attributes}; /// Opening tag data (`Event::Start`), with optional attributes: ``. 
@@ -1432,6 +1433,71 @@ impl<'a> BytesDecl<'a> { .transpose() } + /// Gets the XML version as a typed enumeration. + /// + /// According to the [grammar], the version *must* be the first thing in the declaration. + /// This method tries to extract the first thing in the declaration and return it. + /// If there are multiple attributes, the value of the first one is returned. + /// + /// If the version is missing from the declaration, or the first thing is not a version, + /// [`IllFormedError::MissingDeclVersion`] will be returned. + /// + /// If the version is not 1.0 or 1.1, [`IllFormedError::UnknownVersion`] will be returned. + /// + /// # Examples + /// + /// ``` + /// use quick_xml::XmlVersion; + /// use quick_xml::errors::{Error, IllFormedError}; + /// use quick_xml::events::{BytesDecl, BytesStart}; + /// + /// // <?xml version='1.1'?> + /// let decl = BytesDecl::from_start(BytesStart::from_content(" version='1.1'", 0)); + /// assert_eq!(decl.xml_version().unwrap(), XmlVersion::V1_1); + /// + /// // <?xml version='1.0' version='1.1'?> + /// let decl = BytesDecl::from_start(BytesStart::from_content(" version='1.0' version='1.1'", 0)); + /// assert_eq!(decl.xml_version().unwrap(), XmlVersion::V1_0); + /// + /// // <?xml version='1.2'?> + /// let decl = BytesDecl::from_start(BytesStart::from_content(" version='1.2'", 0)); + /// match decl.xml_version() { + /// Err(Error::IllFormed(IllFormedError::UnknownVersion)) => {}, + /// _ => assert!(false), + /// } + /// + /// // <?xml encoding='utf-8'?> + /// let decl = BytesDecl::from_start(BytesStart::from_content(" encoding='utf-8'", 0)); + /// match decl.xml_version() { + /// Err(Error::IllFormed(IllFormedError::MissingDeclVersion(Some(key)))) => assert_eq!(key, "encoding"), + /// _ => assert!(false), + /// } + /// + /// // <?xml encoding='utf-8' version='1.1'?> + /// let decl = BytesDecl::from_start(BytesStart::from_content(" encoding='utf-8' version='1.1'", 0)); + /// match decl.xml_version() { + /// Err(Error::IllFormed(IllFormedError::MissingDeclVersion(Some(key)))) => assert_eq!(key, "encoding"), + /// _ => assert!(false), + /// } + /// + /// // <?xml?> + /// let decl = 
BytesDecl::from_start(BytesStart::from_content("", 0)); + /// match decl.xml_version() { + /// Err(Error::IllFormed(IllFormedError::MissingDeclVersion(None))) => {}, + /// _ => assert!(false), + /// } + /// ``` + /// + /// [grammar]: https://www.w3.org/TR/xml11/#NT-XMLDecl + pub fn xml_version(&self) -> Result { + let v = self.version()?; + match v.as_ref() { + b"1.0" => Ok(XmlVersion::V1_0), + b"1.1" => Ok(XmlVersion::V1_1), + _ => Err(Error::IllFormed(IllFormedError::UnknownVersion)), + } + } + /// Gets the actual encoding using [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get) /// algorithm. /// diff --git a/src/lib.rs b/src/lib.rs index 068c2b5e..46d69da1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -78,3 +78,22 @@ pub use crate::errors::serialize::{DeError, SeError}; pub use crate::errors::{Error, Result}; pub use crate::reader::{NsReader, Reader}; pub use crate::writer::{ElementWriter, Writer}; + +/// Version of the XML standard. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum XmlVersion { + /// [Version 1.0], which is the default version of an XML document if the XML declaration + /// is missing. Most documents in the world are still XML 1.0 documents. 
+ /// + /// [Version 1.0]: https://www.w3.org/TR/xml/ + V1_0, + /// [Version 1.1](https://www.w3.org/TR/xml11/) + V1_1, +} + +impl Default for XmlVersion { + #[inline] + fn default() -> Self { + Self::V1_0 + } +} From 7528d46a8f4b3d7b6b04fb08b01defdc99e4e799 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 15 Feb 2026 16:52:11 +0500 Subject: [PATCH 3/4] Add tests for parsing EOLs in different XML versions failures: xml_version::unknown xml_version::v1_0_explicit xml_version::v1_0_implicit --- tests/serde-de.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/serde-de.rs b/tests/serde-de.rs index 98cbf2b3..ac35ff63 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1968,3 +1968,80 @@ mod xml_prolog { ); } } + +/// Tests for https://github.com/tafia/quick-xml/pull/937. +/// +/// Checks that the correct EOL normalization rules are applied to the text. +mod xml_version { + use super::*; + use pretty_assertions::assert_eq; + use quick_xml::errors::{Error, IllFormedError}; + + #[derive(Debug, Deserialize, PartialEq)] + struct Root { + #[serde(rename = "$text")] + text: String, + } + + #[test] + fn v1_0_implicit() { + assert_eq!( + from_str::( + "\ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + " + ) + .unwrap(), + Root { + text: "\n,\n,\n,\n\u{0085},\u{0085},\u{2028}".to_string(), + } + ); + } + + #[test] + fn v1_0_explicit() { + assert_eq!( + from_str::( + "\ + \ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + " + ) + .unwrap(), + Root { + text: "\n,\n,\n,\n\u{0085},\u{0085},\u{2028}".to_string(), + } + ); + } + + #[test] + fn v1_1() { + assert_eq!( + from_str::( + "\ + \ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + " + ) + .unwrap(), + Root { + text: "\n,\n,\n,\n,\n,\n".to_string(), + } + ); + } + + #[test] + fn unknown() { + match from_str::( + "\ + \ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + ", + ) { + Err(DeError::InvalidXml(Error::IllFormed(cause))) => { + assert_eq!(cause, IllFormedError::UnknownVersion,) + 
} + x => panic!("Expected `Err(InvalidXml(IllFormed(_)))`, but got {:?}", x), + } + } +} From a759d652b8406648d8fd5e5f795916360bf85924 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 20 Jan 2026 00:34:20 +0500 Subject: [PATCH 4/4] Consider XML version when parsing XML using Deserializer Fixes all errors --- Changelog.md | 6 ++++++ src/de/mod.rs | 51 ++++++++++++++++++++++++++++++++++++++++++----- src/events/mod.rs | 21 +++++++++++++------ 3 files changed, 67 insertions(+), 11 deletions(-) diff --git a/Changelog.md b/Changelog.md index 1c974ae5..452cb84f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -21,8 +21,14 @@ ### Bug Fixes +- [#938]: Use correct rules for EOL normalization in `Deserializer` when parsing XML 1.0 documents. + Previously XML 1.1 rules were applied. + ### Misc Changes +- [#938]: Now `BytesText::xml_content`, `BytesCData::xml_content` and `BytesRef::xml_content` + accept an `XmlVersion` parameter to apply correct EOL normalization rules. + [#938]: https://github.com/tafia/quick-xml/pull/938 diff --git a/src/de/mod.rs b/src/de/mod.rs index d2a8bf16..de2206a7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2107,6 +2107,7 @@ pub use self::attributes::AttributesDeserializer; pub use self::resolver::{EntityResolver, PredefinedEntityResolver}; pub use self::simple_type::SimpleTypeDeserializer; pub use crate::errors::serialize::DeError; +use crate::XmlVersion; use crate::{ de::map::ElementMapAccess, @@ -2391,8 +2392,12 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { } match self.next_impl()? 
{ - PayloadEvent::Text(e) => result.to_mut().push_str(&e.xml_content()?), - PayloadEvent::CData(e) => result.to_mut().push_str(&e.xml_content()?), + PayloadEvent::Text(e) => result + .to_mut() + .push_str(&e.xml_content(self.reader.xml_version())?), + PayloadEvent::CData(e) => result + .to_mut() + .push_str(&e.xml_content(self.reader.xml_version())?), PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef @@ -2408,8 +2413,10 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { return match self.next_impl()? { PayloadEvent::Start(e) => Ok(DeEvent::Start(e)), PayloadEvent::End(e) => Ok(DeEvent::End(e)), - PayloadEvent::Text(e) => self.drain_text(e.xml_content()?), - PayloadEvent::CData(e) => self.drain_text(e.xml_content()?), + PayloadEvent::Text(e) => self.drain_text(e.xml_content(self.reader.xml_version())?), + PayloadEvent::CData(e) => { + self.drain_text(e.xml_content(self.reader.xml_version())?) + } PayloadEvent::DocType(e) => { self.entity_resolver .capture(e) @@ -3068,7 +3075,13 @@ where let config = reader.config_mut(); config.expand_empty_elements = true; - Self::new(SliceReader { reader }, entity_resolver) + Self::new( + SliceReader { + reader, + version: XmlVersion::V1_0, + }, + entity_resolver, + ) } } @@ -3148,6 +3161,7 @@ where IoReader { reader, buf: Vec::new(), + version: XmlVersion::V1_0, }, entity_resolver, ) @@ -3167,6 +3181,7 @@ where IoReader { reader, buf: Vec::new(), + version: XmlVersion::V1_0, }, entity_resolver, ) @@ -3391,6 +3406,9 @@ pub trait XmlRead<'i> { /// when it cannot satisfy the lifetime. fn read_to_end(&mut self, name: QName) -> Result<(), DeError>; + /// Return an XML version of the source. + fn xml_version(&self) -> XmlVersion; + /// A copy of the reader's decoder used to decode strings. 
fn decoder(&self) -> Decoder; @@ -3408,6 +3426,7 @@ pub trait XmlRead<'i> { pub struct IoReader { reader: NsReader, buf: Vec, + version: XmlVersion, } impl IoReader { @@ -3451,6 +3470,9 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { self.buf.clear(); let event = self.reader.read_event_into(&mut self.buf)?; + if let Event::Decl(e) = &event { + self.version = e.xml_version()?; + } if let Some(event) = skip_uninterested(event) { return Ok(event.into_owned()); } @@ -3464,6 +3486,12 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { } } + #[inline] + fn xml_version(&self) -> XmlVersion { + self.version + } + + #[inline] fn decoder(&self) -> Decoder { self.reader.decoder() } @@ -3479,6 +3507,7 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { /// [`Deserializer::from_str`]. pub struct SliceReader<'de> { reader: NsReader<&'de [u8]>, + version: XmlVersion, } impl<'de> SliceReader<'de> { @@ -3519,6 +3548,9 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { fn next(&mut self) -> Result, DeError> { loop { let event = self.reader.read_event()?; + if let Event::Decl(e) = &event { + self.version = e.xml_version()?; + } if let Some(event) = skip_uninterested(event) { return Ok(event); } @@ -3532,6 +3564,12 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { } } + #[inline] + fn xml_version(&self) -> XmlVersion { + self.version + } + + #[inline] fn decoder(&self) -> Decoder { self.reader.decoder() } @@ -4123,9 +4161,11 @@ mod tests { let mut reader1 = IoReader { reader: NsReader::from_reader(s.as_bytes()), buf: Vec::new(), + version: XmlVersion::V1_0, }; let mut reader2 = SliceReader { reader: NsReader::from_str(s), + version: XmlVersion::V1_0, }; loop { @@ -4151,6 +4191,7 @@ mod tests { let mut reader = SliceReader { reader: NsReader::from_str(s), + version: XmlVersion::V1_0, }; let config = reader.reader.config_mut(); diff --git a/src/events/mod.rs b/src/events/mod.rs index 2e6fddc3..217cddc4 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -660,8 +660,11 @@ impl<'a> 
BytesText<'a> { /// Alias for [`xml11_content()`](Self::xml11_content). #[inline] - pub fn xml_content(&self) -> Result, EncodingError> { - self.xml11_content() + pub fn xml_content(&self, version: XmlVersion) -> Result, EncodingError> { + match version { + XmlVersion::V1_0 => self.xml10_content(), + XmlVersion::V1_1 => self.xml11_content(), + } } /// Alias for [`xml10_content()`](Self::xml10_content). @@ -968,8 +971,11 @@ impl<'a> BytesCData<'a> { /// Alias for [`xml11_content()`](Self::xml11_content). #[inline] - pub fn xml_content(&self) -> Result, EncodingError> { - self.xml11_content() + pub fn xml_content(&self, version: XmlVersion) -> Result, EncodingError> { + match version { + XmlVersion::V1_0 => self.xml10_content(), + XmlVersion::V1_1 => self.xml11_content(), + } } /// Alias for [`xml10_content()`](Self::xml10_content). @@ -1680,8 +1686,11 @@ impl<'a> BytesRef<'a> { /// Alias for [`xml11_content()`](Self::xml11_content). #[inline] - pub fn xml_content(&self) -> Result, EncodingError> { - self.xml11_content() + pub fn xml_content(&self, version: XmlVersion) -> Result, EncodingError> { + match version { + XmlVersion::V1_0 => self.xml10_content(), + XmlVersion::V1_1 => self.xml11_content(), + } } /// Alias for [`xml10_content()`](Self::xml10_content).