diff --git a/Changelog.md b/Changelog.md index f77344d20..c4ae643be 100644 --- a/Changelog.md +++ b/Changelog.md @@ -18,6 +18,20 @@ - [#938]: Add new enumeration `XmlVersion` and typified getter `BytesDecl::xml_version()`. - [#938]: Add new error variant `IllFormedError::UnknownVersion`. +- [#371]: Add new error variant `EscapeError::TooManyNestedEntities`. +- [#371]: Improved compliance with the XML attribute value normalization process by adding + - `Attribute::normalized_value()` + - `Attribute::normalized_value_with()` + - `Attribute::decoded_and_normalized_value()` + - `Attribute::decoded_and_normalized_value_with()` + + which ought to be used in place of the deprecated + - `Attribute::unescape_value()` + - `Attribute::unescape_value_with()` + - `Attribute::decode_and_unescape_value()` + - `Attribute::decode_and_unescape_value_with()` + + The deprecated functions now behave the same as the newly added ones. ### Bug Fixes @@ -31,6 +45,7 @@ - [#938]: Now `BytesText::xml_content`, `BytesCData::xml_content` and `BytesRef::xml_content` accepts `XmlVersion` parameter to apply correct EOL normalization rules. 
+[#371]: https://github.com/tafia/quick-xml/issues/371 [#914]: https://github.com/tafia/quick-xml/pull/914 [#938]: https://github.com/tafia/quick-xml/pull/938 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 7e98628b9..6955e6ad1 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -5,7 +5,7 @@ use criterion::{self, criterion_group, criterion_main, Criterion, Throughput}; use quick_xml::events::Event; use quick_xml::reader::{NsReader, Reader}; -use quick_xml::Result as XmlResult; +use quick_xml::{Result as XmlResult, XmlVersion}; use std::hint::black_box; static RPM_PRIMARY: &str = include_str!("../tests/documents/rpm_primary.xml"); @@ -48,14 +48,17 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); + let mut version = XmlVersion::V1_0; loop { match black_box(r.read_event()?) { + Event::Decl(e) => { + version = e.xml_version()?; + } Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(version, r.decoder())?); } } Event::Text(e) => { @@ -72,15 +75,18 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); + let mut version = XmlVersion::V1_0; let mut buf = Vec::new(); loop { match black_box(r.read_event_into(&mut buf)?) 
{ + Event::Decl(e) => { + version = e.xml_version()?; + } Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(version, r.decoder())?); } } Event::Text(e) => { @@ -98,15 +104,18 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { let mut r = NsReader::from_str(doc); + let mut version = XmlVersion::V1_0; loop { match black_box(r.read_resolved_event()?) { + (_, Event::Decl(e)) => { + version = e.xml_version()?; + } (resolved_ns, Event::Start(e) | Event::Empty(e)) => { black_box(resolved_ns); for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(version, r.decoder())?); } } (resolved_ns, Event::Text(e)) => { @@ -125,16 +134,19 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { let mut r = NsReader::from_reader(doc); + let mut version = XmlVersion::V1_0; let mut buf = Vec::new(); loop { match black_box(r.read_resolved_event_into(&mut buf)?) 
{ + (_, Event::Decl(e)) => { + version = e.xml_version()?; + } (resolved_ns, Event::Start(e) | Event::Empty(e)) => { black_box(resolved_ns); for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(version, r.decoder())?); } } (resolved_ns, Event::Text(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 97e67e2b7..bb2e96491 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -247,6 +247,50 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + b.iter(|| { + black_box(unescape("foobar")).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + b.iter(|| { + black_box(unescape("just a bit of text without any entities")).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + b.iter(|| { + black_box(unescape("just a bit\n of text without\tany entities")).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + b.iter(|| { + let text = "prefix "some stuff","more stuff""; + black_box(unescape(text)).unwrap(); + let text = "&<"; + black_box(unescape(text)).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + b.iter(|| { + let text = "age > 72 && age < 21"; + black_box(unescape(text)).unwrap(); + let text = ""what's that?""; + black_box(unescape(text)).unwrap(); + }) + }); + group.finish(); } @@ -359,6 +403,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index be9d11ea5..ed8c082a2 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -4,6 +4,7 @@ //! 
- decode attribute values //! //! NB: this example is deliberately kept simple: +//! * it assumes only XML 1.0 dialect (most widely used in the world) //! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data) //! * it only handles internal entities; //! * the regex in this example is simple but brittle; @@ -19,6 +20,7 @@ use quick_xml::escape::EscapeError; use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; use quick_xml::name::QName; use quick_xml::reader::Reader; +use quick_xml::XmlVersion; use regex::bytes::Regex; use pretty_assertions::assert_eq; @@ -154,7 +156,12 @@ fn main() -> Result<(), Box> { let label = attrs.next().unwrap()?; assert_eq!(label.key, QName(b"label")); assert_eq!( - label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + label.decoded_and_normalized_value_with( + XmlVersion::V1_0, + reader.decoder(), + 9, + |e| reader.get_entity(e) + )?, "Message: hello world" ); @@ -185,7 +192,9 @@ fn main() -> Result<(), Box> { let attr = attrs.next().unwrap()?; assert_eq!(attr.key, QName(b"attr")); assert_eq!( - attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + attr.decoded_and_normalized_value_with(XmlVersion::V1_0, reader.decoder(), 9, |e| { + reader.get_entity(e) + })?, "Message: hello world" ); diff --git a/examples/read_nodes.rs b/examples/read_nodes.rs index 50a5f90d8..470632105 100644 --- a/examples/read_nodes.rs +++ b/examples/read_nodes.rs @@ -6,6 +6,7 @@ use quick_xml::events::attributes::AttrError; use quick_xml::events::{BytesStart, Event}; use quick_xml::name::QName; use quick_xml::reader::Reader; +use quick_xml::XmlVersion; use std::borrow::Cow; use std::collections::HashMap; use std::convert::Infallible; @@ -70,8 +71,12 @@ impl Translation { for attr_result in element.attributes() { let a = attr_result?; match a.key.as_ref() { - b"Language" => lang = a.decode_and_unescape_value(reader.decoder())?, - b"Tag" => tag = 
a.decode_and_unescape_value(reader.decoder())?, + b"Language" => { + lang = a.decoded_and_normalized_value(XmlVersion::V1_0, reader.decoder())? + } + b"Tag" => { + tag = a.decoded_and_normalized_value(XmlVersion::V1_0, reader.decoder())? + } _ => (), } } @@ -141,7 +146,7 @@ fn main() -> Result<(), AppError> { Ok::, Infallible>(std::borrow::Cow::from("")) }) .unwrap().to_string(); - let value = a.decode_and_unescape_value(reader.decoder()).or_else(|err| { + let value = a.decoded_and_normalized_value(XmlVersion::V1_0, reader.decoder()).or_else(|err| { dbg!("unable to read key in DefaultSettings attribute {:?}, utf8 error {:?}", &a, err); Ok::, Infallible>(std::borrow::Cow::from("")) }).unwrap().to_string(); diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index dbadfe2f2..30bdcffe4 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -2,7 +2,7 @@ use libfuzzer_sys::fuzz_target; use std::hint::black_box; -use quick_xml::{events::Event, reader::Reader, writer::Writer}; +use quick_xml::{events::Event, reader::Reader, writer::Writer, XmlVersion}; use std::io::Cursor; macro_rules! 
debug_format { @@ -15,6 +15,7 @@ fn round_trip(reader: &mut Reader) -> () where R: std::io::BufRead, { + let mut version = XmlVersion::V1_0; let mut writer = Writer::new(Cursor::new(Vec::new())); let mut buf = vec![]; let config = reader.config_mut(); @@ -34,14 +35,14 @@ where debug_format!(e.name()); for a in e.attributes() { debug_format!(a); - if a.ok().map_or(false, |a| a.unescape_value().is_err()) { + if a.ok() + .map_or(false, |a| a.normalized_value(version).is_err()) + { break; } } } - Ok(Event::Text(ref e)) - | Ok(Event::Comment(ref e)) - | Ok(Event::DocType(ref e)) => { + Ok(Event::Text(ref e)) | Ok(Event::Comment(ref e)) | Ok(Event::DocType(ref e)) => { debug_format!(e); if let Err(err) = e.decode() { debug_format!(err); @@ -68,6 +69,10 @@ where let _ = black_box(e.version()); let _ = black_box(e.encoding()); let _ = black_box(e.standalone()); + match e.xml_version() { + Ok(v) => version = v, + Err(_) => break, + } } Ok(Event::End(e)) => { debug_format!(e.local_name()); diff --git a/src/de/attributes.rs b/src/de/attributes.rs index 8708885c1..778bb0531 100644 --- a/src/de/attributes.rs +++ b/src/de/attributes.rs @@ -2,13 +2,14 @@ use std::borrow::Cow; -use serde::de::{DeserializeSeed, Deserializer, Error, IntoDeserializer, MapAccess, Visitor}; +use serde::de::{DeserializeSeed, Deserializer, Error, MapAccess, Visitor}; use serde::forward_to_deserialize_any; use crate::de::key::QNameDeserializer; use crate::de::SimpleTypeDeserializer; use crate::errors::serialize::DeError; use crate::events::attributes::Attributes; +use crate::XmlVersion; impl<'i> Attributes<'i> { /// Converts this iterator into a serde's [`MapAccess`] trait to use with serde. @@ -17,15 +18,15 @@ impl<'i> Attributes<'i> { /// # Parameters /// - `prefix`: a prefix of the field names in structs that should be stripped /// to get the local attribute name. 
The [`crate::de::Deserializer`] uses `"@"` - /// as a prefix, but [`Self::into_deserializer()`] uses empy string, which mean + /// as a prefix, whereas an empty string (`""`) means /// that we do not strip anything. /// /// # Example /// ``` /// # use pretty_assertions::assert_eq; /// use quick_xml::events::BytesStart; + /// use quick_xml::XmlVersion; /// use serde::Deserialize; - /// use serde::de::IntoDeserializer; /// /// #[derive(Debug, PartialEq, Deserialize)] /// struct MyData<'i> { @@ -46,7 +47,7 @@ impl<'i> Attributes<'i> { /// 3 /// ); /// // Strip nothing from the field names - /// let de = tag.attributes().clone().into_deserializer(); + /// let de = tag.attributes().clone().into_map_access(XmlVersion::V1_0, ""); /// assert_eq!( /// MyData::deserialize(de).unwrap(), /// MyData { @@ -56,7 +57,7 @@ impl<'i> Attributes<'i> { /// ); /// /// // Strip "@" from the field name - /// let de = tag.attributes().into_map_access("@"); + /// let de = tag.attributes().into_map_access(XmlVersion::V1_0, "@"); /// assert_eq!( /// MyDataPrefixed::deserialize(de).unwrap(), /// MyDataPrefixed { @@ -66,25 +67,21 @@ impl<'i> Attributes<'i> { /// ); /// ``` #[inline] - pub const fn into_map_access(self, prefix: &'static str) -> AttributesDeserializer<'i> { + pub const fn into_map_access( + self, + version: XmlVersion, + prefix: &'static str, + ) -> AttributesDeserializer<'i> { AttributesDeserializer { iter: self, value: None, prefix, key_buf: String::new(), + version, } } } -impl<'de> IntoDeserializer<'de, DeError> for Attributes<'de> { - type Deserializer = AttributesDeserializer<'de>; - - #[inline] - fn into_deserializer(self) -> Self::Deserializer { - self.into_map_access("") - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// /// A deserializer used to make possible to pack all attributes into a struct. 
@@ -106,8 +103,9 @@ pub struct AttributesDeserializer<'i> { /// This prefix will be stripped from struct fields before match against attribute name. prefix: &'static str, /// Buffer to store attribute name as a field name exposed to serde consumers. - /// Keeped in the serializer to avoid many small allocations + /// Kept in the deserializer to avoid many small allocations key_buf: String, + version: XmlVersion, } impl<'de> Deserializer<'de> for AttributesDeserializer<'de> { @@ -157,8 +155,12 @@ impl<'de> MapAccess<'de> for AttributesDeserializer<'de> { { match self.value.take() { Some(value) => { - let de = - SimpleTypeDeserializer::from_part(&value, 0..value.len(), self.iter.decoder()); + let de = SimpleTypeDeserializer::from_attr( + &value, + 0..value.len(), + self.version, + self.iter.decoder(), + ); seed.deserialize(de) } None => Err(DeError::KeyNotRead), diff --git a/src/de/map.rs b/src/de/map.rs index cfa0bca89..3d25a9411 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -342,9 +342,10 @@ where seed: K, ) -> Result { match std::mem::replace(&mut self.source, ValueSource::Unknown) { - ValueSource::Attribute(value) => seed.deserialize(SimpleTypeDeserializer::from_part( + ValueSource::Attribute(value) => seed.deserialize(SimpleTypeDeserializer::from_attr( &self.start.buf, value, + self.de.reader.reader.xml_version(), self.start.decoder(), )), // This arm processes the following XML shape: diff --git a/src/de/simple_type.rs b/src/de/simple_type.rs index 5e615f144..80f68ecf0 100644 --- a/src/de/simple_type.rs +++ b/src/de/simple_type.rs @@ -6,8 +6,9 @@ use crate::de::Text; use crate::encoding::Decoder; use crate::errors::serialize::DeError; -use crate::escape::unescape; +use crate::escape::resolve_predefined_entity; use crate::utils::{trim_xml_spaces, CowRef}; +use crate::XmlVersion; use memchr::memchr; use serde::de::value::UnitDeserializer; use serde::de::{ @@ -39,8 +40,7 @@ macro_rules! 
deserialize_primitive { V: Visitor<'de>, { let de = AtomicDeserializer { - content: self.decode()?, - escaped: self.escaped, + content: self.content()?, }; de.$method(visitor) } @@ -123,8 +123,6 @@ impl<'de, 'a> Content<'de, 'a> { struct AtomicDeserializer<'de, 'a> { /// Content of the attribute value, text content or CDATA content content: CowRef<'de, 'a, str>, - /// If `true`, `content` in an escaped form and should be unescaped before use - escaped: bool, } impl<'de, 'a> Deserializer<'de> for AtomicDeserializer<'de, 'a> { @@ -145,19 +143,11 @@ impl<'de, 'a> Deserializer<'de> for AtomicDeserializer<'de, 'a> { where V: Visitor<'de>, { - let text = self.content.as_ref(); - let text = if self.escaped { - unescape(text)? - } else { - Cow::Borrowed(text) - }; - match trim_xml_spaces(&text) { + let text: &str = self.content.as_ref(); + match trim_xml_spaces(text) { "1" | "true" => visitor.visit_bool(true), "0" | "false" => visitor.visit_bool(false), - _ => match text { - Cow::Borrowed(_) => self.content.deserialize_str(visitor), - Cow::Owned(s) => visitor.visit_string(s), - }, + _ => self.content.deserialize_str(visitor), } } @@ -183,20 +173,12 @@ impl<'de, 'a> Deserializer<'de> for AtomicDeserializer<'de, 'a> { V: Visitor<'de>, { let text: &str = self.content.as_ref(); - let text = if self.escaped { - unescape(text)? - } else { - Cow::Borrowed(text) - }; - let trimmed = trim_xml_spaces(&text); + let trimmed = trim_xml_spaces(text); // If string is empty or contains only XML space characters (probably only one), // deserialize as usual string and allow visitor to accept or reject it. // Otherwise trim spaces and allow visitor to accept or reject the rest. 
if trimmed.is_empty() { - match text { - Cow::Borrowed(_) => self.content.deserialize_str(visitor), - Cow::Owned(s) => visitor.visit_string(s), - } + self.content.deserialize_str(visitor) } else { visitor.visit_str(trimmed) } @@ -216,14 +198,7 @@ impl<'de, 'a> Deserializer<'de> for AtomicDeserializer<'de, 'a> { where V: Visitor<'de>, { - if self.escaped { - match unescape(self.content.as_ref())? { - Cow::Borrowed(_) => self.content.deserialize_str(visitor), - Cow::Owned(s) => visitor.visit_string(s), - } - } else { - self.content.deserialize_str(visitor) - } + self.content.deserialize_str(visitor) } fn deserialize_string(self, visitor: V) -> Result @@ -377,8 +352,6 @@ impl<'de> VariantAccess<'de> for UnitOnly { struct ListIter<'de, 'a> { /// If `Some`, contains unconsumed data of the list content: Option>, - /// If `true`, `content` in escaped form and should be unescaped before use - escaped: bool, } impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { type Error = DeError; @@ -400,19 +373,15 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { None => match content { Content::Input(s) => seed.deserialize(AtomicDeserializer { content: CowRef::Input(s), - escaped: self.escaped, }), Content::Slice(s) => seed.deserialize(AtomicDeserializer { content: CowRef::Slice(s), - escaped: self.escaped, }), Content::Owned(s, 0) => seed.deserialize(AtomicDeserializer { content: CowRef::Owned(s), - escaped: self.escaped, }), Content::Owned(s, offset) => seed.deserialize(AtomicDeserializer { content: CowRef::Slice(s.split_at(offset).1), - escaped: self.escaped, }), }, // `content` started with a space, skip them all @@ -442,7 +411,6 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { seed.deserialize(AtomicDeserializer { content: CowRef::Input(item), - escaped: self.escaped, }) } Content::Slice(s) => { @@ -451,7 +419,6 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { seed.deserialize(AtomicDeserializer { content: CowRef::Slice(item), - escaped: self.escaped, }) } // 
Skip additional bytes if we own data for next iteration, but deserialize from @@ -461,7 +428,6 @@ impl<'de, 'a> SeqAccess<'de> for ListIter<'de, 'a> { let item = rest.split_at(end).0; let result = seed.deserialize(AtomicDeserializer { content: CowRef::Slice(item), - escaped: self.escaped, }); self.content = Some(Content::Owned(s, skip + end)); @@ -534,10 +500,11 @@ pub struct SimpleTypeDeserializer<'de, 'a> { /// - In case of text contains unescaped text value content: CowRef<'de, 'a, [u8]>, /// If `true`, `content` in escaped form and should be unescaped before use - escaped: bool, + is_attr: bool, /// Decoder used to deserialize string data, numeric and boolean data. /// Not used for deserializing raw byte buffers decoder: Decoder, + version: XmlVersion, } impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { @@ -549,7 +516,7 @@ impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { Cow::Borrowed(slice) => CowRef::Input(slice.as_bytes()), Cow::Owned(content) => CowRef::Owned(content.into_bytes()), }; - Self::new(content, false, Decoder::utf8()) + Self::new(content, false, XmlVersion::V1_0, Decoder::utf8()) } /// Creates a deserializer from an XML text node, that possible borrowed from input. /// @@ -564,25 +531,32 @@ impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { /// /// This constructor used internally to deserialize from attribute values. 
#[allow(clippy::ptr_arg)] - pub(crate) fn from_part( + pub(crate) fn from_attr( value: &'a Cow<'de, [u8]>, range: Range, + version: XmlVersion, decoder: Decoder, ) -> Self { let content = match value { Cow::Borrowed(slice) => CowRef::Input(&slice[range]), Cow::Owned(slice) => CowRef::Slice(&slice[range]), }; - Self::new(content, true, decoder) + Self::new(content, true, version, decoder) } /// Constructor for tests #[inline] - const fn new(content: CowRef<'de, 'a, [u8]>, escaped: bool, decoder: Decoder) -> Self { + const fn new( + content: CowRef<'de, 'a, [u8]>, + is_attr: bool, + version: XmlVersion, + decoder: Decoder, + ) -> Self { Self { content, - escaped, + is_attr, decoder, + version, } } @@ -605,6 +579,20 @@ impl<'de, 'a> SimpleTypeDeserializer<'de, 'a> { }, }) } + + fn content<'b>(&'b self) -> Result, DeError> { + let content = self.decode()?; + if self.is_attr { + let value = + self.version + .normalize_attribute_value(&content, 128, resolve_predefined_entity)?; + return Ok(match value { + Cow::Borrowed(_) => content, + Cow::Owned(value) => CowRef::Owned(value), + }); + } + Ok(content) + } } impl<'de, 'a> Deserializer<'de> for SimpleTypeDeserializer<'de, 'a> { @@ -685,14 +673,13 @@ impl<'de, 'a> Deserializer<'de> for SimpleTypeDeserializer<'de, 'a> { where V: Visitor<'de>, { - let content = match self.decode()? { + let content = match self.content()? 
{ CowRef::Input(s) => Content::Input(s), CowRef::Slice(s) => Content::Slice(s), CowRef::Owned(s) => Content::Owned(s, 0), }; visitor.visit_seq(ListIter { content: Some(content), - escaped: self.escaped, }) } @@ -792,7 +779,12 @@ mod tests { fn $name() { let decoder = Decoder::$encoding(); let xml = $xml; - let de = SimpleTypeDeserializer::new(CowRef::Input(xml.as_ref()), true, decoder); + let de = SimpleTypeDeserializer::new( + CowRef::Input(xml.as_ref()), + true, + XmlVersion::V1_0, + decoder, + ); let data: $type = Deserialize::deserialize(de).unwrap(); assert_eq!(data, $result); @@ -806,7 +798,12 @@ mod tests { fn $name() { let decoder = Decoder::$encoding(); let xml = $xml; - let de = SimpleTypeDeserializer::new(CowRef::Input(xml.as_ref()), true, decoder); + let de = SimpleTypeDeserializer::new( + CowRef::Input(xml.as_ref()), + true, + XmlVersion::V1_0, + decoder, + ); let data: $type = Deserialize::deserialize(de).unwrap(); assert_eq!(data, $result); @@ -831,7 +828,12 @@ mod tests { fn $name() { let decoder = Decoder::$encoding(); let xml = $xml; - let de = SimpleTypeDeserializer::new(CowRef::Input(xml.as_ref()), true, decoder); + let de = SimpleTypeDeserializer::new( + CowRef::Input(xml.as_ref()), + true, + XmlVersion::V1_0, + decoder, + ); let err = <$type as Deserialize>::deserialize(de).unwrap_err(); match err { @@ -902,7 +904,6 @@ mod tests { fn $name() { let de = AtomicDeserializer { content: CowRef::Input($input), - escaped: true, }; let data: $type = Deserialize::deserialize(de).unwrap(); @@ -919,11 +920,10 @@ mod tests { fn $name() { let de = AtomicDeserializer { content: CowRef::Input($input), - escaped: true, }; let data: $type = Deserialize::deserialize(de).unwrap(); - assert_eq!(data, $result); + assert_eq!(data, $result, "deserialization failed"); // Roundtrip to ensure that serializer corresponds to deserializer let mut buffer = String::new(); @@ -935,7 +935,7 @@ mod tests { write_delimiter: false, }) .unwrap(); - assert_eq!(buffer, $input); + 
assert_eq!(buffer, $input, "serialization failed"); assert_eq!(has_written, !buffer.is_empty()); } }; @@ -949,7 +949,6 @@ mod tests { fn $name() { let de = AtomicDeserializer { content: CowRef::Input($input), - escaped: true, }; let err = <$type as Deserialize>::deserialize(de).unwrap_err(); @@ -986,17 +985,20 @@ mod tests { deserialized_to!(f64_: f64 = "1.23" => 1.23); deserialized_to!(char_unescaped: char = "h" => 'h'); - deserialized_to!(char_escaped: char = "<" => '<'); + err!(char_escaped: char = "<" + => Custom("invalid value: string \"<\", expected a character")); - deserialized_to!(string: String = "<escaped string" => " "<escaped string"); // Serializer will escape space. Because borrowing has meaning only for deserializer, // no need to test roundtrip here, it is already tested with non-borrowing version deserialized_to_only!(borrowed_str: &str = "non-escaped string" => "non-escaped string"); - err!(escaped_str: &str = "escaped string" - => Custom("invalid type: string \"escaped string\", expected a borrowed string")); + deserialized_to_only!(escaped_str: &str = "escaped string" => "escaped string"); err!(byte_buf: ByteBuf = "<escaped string" - => Custom("invalid type: string \" Custom("invalid type: string \"<escaped string\", expected byte data")); err!(borrowed_bytes: Bytes = "non-escaped string" => Custom("invalid type: string \"non-escaped string\", expected borrowed bytes")); @@ -1006,7 +1008,7 @@ mod tests { deserialized_to_only!(unit: () = "anything" => ()); deserialized_to_only!(unit_struct: Unit = "anything" => Unit); - deserialized_to!(newtype_owned: Newtype = "<escaped string" => Newtype(" Newtype("<escaped string".into())); // Serializer will escape space. 
Because borrowing has meaning only for deserializer, // no need to test roundtrip here, it is already tested with non-borrowing version deserialized_to_only!(newtype_borrowed: BorrowedNewtype = "non-escaped string" @@ -1043,7 +1045,6 @@ mod tests { fn owned_data() { let de = AtomicDeserializer { content: CowRef::Owned("string slice".into()), - escaped: true, }; assert_eq!(de.content.deref(), "string slice"); @@ -1057,7 +1058,6 @@ mod tests { fn borrowed_from_deserializer() { let de = AtomicDeserializer { content: CowRef::Slice("string slice"), - escaped: true, }; assert_eq!(de.content.deref(), "string slice"); @@ -1075,7 +1075,6 @@ mod tests { fn empty() { let mut seq = ListIter { content: Some(Content::Input("")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), None); @@ -1086,7 +1085,6 @@ mod tests { fn only_spaces() { let mut seq = ListIter { content: Some(Content::Input(" ")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), None); @@ -1097,7 +1095,6 @@ mod tests { fn one_item() { let mut seq = ListIter { content: Some(Content::Input("abc")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), Some("abc")); @@ -1109,7 +1106,6 @@ mod tests { fn two_items() { let mut seq = ListIter { content: Some(Content::Input("abc def")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), Some("abc")); @@ -1122,7 +1118,6 @@ mod tests { fn leading_spaces() { let mut seq = ListIter { content: Some(Content::Input(" def")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), Some("def")); @@ -1134,7 +1129,6 @@ mod tests { fn trailing_spaces() { let mut seq = ListIter { content: Some(Content::Input("abc ")), - escaped: true, }; assert_eq!(seq.next_element::<&str>().unwrap(), Some("abc")); @@ -1146,7 +1140,6 @@ mod tests { fn mixed_types() { let mut seq = ListIter { content: Some(Content::Input("string 1.23 42 true false h Unit")), - escaped: true, }; 
assert_eq!(seq.next_element::<&str>().unwrap(), Some("string")); diff --git a/src/escape.rs b/src/escape.rs index a0fc6a384..a1cec3c5b 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use std::fmt::{self, Write}; use std::num::ParseIntError; use std::ops::Range; +use std::slice::Iter; /// Error of parsing character reference (`&#;` or `&#x;`). #[derive(Clone, Debug, PartialEq)] @@ -51,6 +52,12 @@ pub enum EscapeError { /// Attempt to parse character reference (`&#;` or `&#x;`) /// was unsuccessful, not all characters are decimal or hexadecimal numbers. InvalidCharRef(ParseCharRefError), + /// Expanded more than the maximum allowed number of entities during attribute normalization. + /// + /// Attribute normalization includes expansion of general entities (`&entity;`) + /// whose replacement text could also contain entities, which must also be expanded. + /// If the chain of nested expansions is longer than the limit in effect (e.g. 128), this error is returned. + TooManyNestedEntities, } impl std::fmt::Display for EscapeError { @@ -67,6 +74,9 @@ impl std::fmt::Display for EscapeError { Self::InvalidCharRef(e) => { write!(f, "invalid character reference: {}", e) } + Self::TooManyNestedEntities => { + f.write_str("too many nested entities in an attribute value") + } } } } @@ -461,13 +471,13 @@ pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str // we are sure that index within string normalized.push_str(&text[0..i]); - let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n'); + let mut pos = normalize_xml10_eol_step(&mut normalized, text, i, '\n'); while let Some(i) = memchr(b'\r', &bytes[pos..]) { let index = pos + i; // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because // we are sure that index within string normalized.push_str(&text[pos..index]); - pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n'); + pos = normalize_xml10_eol_step(&mut normalized, text, index, '\n'); } if let Some(rest) = 
text.get(pos..) { normalized.push_str(rest); @@ -489,12 +499,8 @@ pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str /// - `ch`: a character that should be put to the string instead of newline sequence /// /// [only for]: https://html.spec.whatwg.org/#normalize-newlines -fn normalize_xml10_eol_step( - normalized: &mut String, - input: &[u8], - index: usize, - ch: char, -) -> usize { +fn normalize_xml10_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize { + let input = text.as_bytes(); match input[index] { b'\r' => { normalized.push(ch); @@ -517,6 +523,312 @@ fn normalize_xml10_eol_step( //////////////////////////////////////////////////////////////////////////////////////////////////// +pub(crate) fn normalize_xml10_attribute_value<'input, 'entity, F>( + value: &'input str, + depth: usize, + resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + normalize_attribute_value( + value, + depth, + is_xml10_normalization_char, + normalize_xml10_eol_step, + resolve_entity, + ) +} + +const fn is_xml10_normalization_char(b: &u8) -> bool { + // The following sequences should be translated into a single `\n` (U+000a) character + // to normalize EOLs: + // + // |UTF-8 |String| + // |--------|------| + // |0d 0a |\r\n | + // |0d |\r | + matches!(*b, b'\t' | b'\r' | b'\n' | b'&') +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +pub(crate) fn normalize_xml11_attribute_value<'input, 'entity, F>( + value: &'input str, + depth: usize, + resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + normalize_attribute_value( + value, + depth, + is_xml11_normalization_char, + normalize_xml11_eol_step, + resolve_entity, + ) +} + +const fn 
is_xml11_normalization_char(b: &u8) -> bool { + // The following sequences should be translated into a single `\n` (U+000a) character + // to normalize EOLs: + // + // |UTF-8 |String| + // |--------|------| + // |0d 0a |\r\n | + // |0d c2 85|\r\x85| + // |0d |\r | + // |c2 85 |\x85 | + // |e2 80 a8|\x2028| + matches!(*b, b'\t' | b'\r' | b'\n' | 0xC2 | 0xE2 | b'&') +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns the attribute value normalized as per [the XML specification], +/// using a custom entity resolver. +/// +/// Do not use this method with HTML attributes. +/// +/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>` +/// and the characters `\t`, `\r`, `\n` are replaced with space characters. A function +/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still +/// take precedence. +/// +/// This will allocate unless the raw attribute value does not require normalization. +/// +/// # Parameters +/// +/// - `value`: unnormalized attribute value +/// - `depth`: maximum number of nested entities that can be expanded. If the expansion +/// chain is longer than this value, the function will return [`EscapeError::TooManyNestedEntities`] +/// - `is_normalization_char`: a function to check if byte is the start byte of character +/// that should be normalized (UTF-8 encoding is assumed) +/// - `normalize_eol_step`: a function that performs EOL normalization of a character +/// - `resolve_entity`: a function to resolve an entity. This function could be called +/// multiple times on the same input and can return different values in each case +/// for the same input, although it is not recommended +/// +/// # Lifetimes +/// +/// - `'input`: lifetime of the unnormalized attribute. 
If normalization is not required, +/// the input returned unchanged with the same lifetime +/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine +/// +/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize +pub fn normalize_attribute_value<'input, 'entity, C, E, F>( + value: &'input str, + depth: usize, + is_normalization_char: C, + normalize_eol_step: E, + mut resolve_entity: F, +) -> Result, EscapeError> +where + C: Fn(&u8) -> bool, + E: Fn(&mut String, &str, usize, char) -> usize, + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + let mut iter = value.as_bytes().iter(); + + // If we found the character that requires normalization, create a normalized + // version of the attribute, otherwise return the value unchanged + if let Some(i) = iter.position(&is_normalization_char) { + let mut normalized = String::with_capacity(value.len()); + let pos = normalize_attr_step( + &mut normalized, + &mut iter, + value, + 0, + i, + depth, + &is_normalization_char, + &normalize_eol_step, + &mut resolve_entity, + )?; + + normalize_attr_steps( + &mut normalized, + &mut iter, + value, + pos, + depth, + &is_normalization_char, + &normalize_eol_step, + &mut resolve_entity, + )?; + return Ok(normalized.into()); + } + Ok(Cow::Borrowed(value)) +} + +fn normalize_attr_steps<'entity, C, E, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + mut pos: usize, + depth: usize, + is_normalization_char: &C, + normalize_eol_step: &E, + resolve_entity: &mut F, +) -> Result<(), EscapeError> +where + C: Fn(&u8) -> bool, + E: Fn(&mut String, &str, usize, char) -> usize, + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + while let Some(i) = iter.position(is_normalization_char) { + pos = normalize_attr_step( + normalized, + iter, + input, + pos, + pos + i, + depth, + is_normalization_char, + 
normalize_eol_step, + resolve_entity, + )?; + } + if let Some(rest) = input.get(pos..) { + normalized.push_str(rest); + } + Ok(()) +} + +/// Performs one step of the [normalization algorithm] (but with recursive part): +/// +/// 1. For a character reference, append the referenced character +/// to the normalized value. +/// 2. For an entity reference, recursively apply this algorithm +/// to the replacement text of the entity. +/// 3. For a white space character (#x20, #xD, #xA, #x9), append +/// a space character (#x20) to the normalized value. +/// 4. For another character, append the character to the normalized value. +/// +/// Because [according to the specification], XML parser should parse line-of-end +/// normalized input, but quick-xml does not do that, this function also performs +/// normalization of EOL characters. That should be done before expanding entities +/// and character references, so cannot be processed later. +/// +/// This function could be used also just to normalize line ends if the iterator +/// won't be stop on `&` characters. +/// +/// # Parameters +/// +/// - `normalized`: Output of the algorithm. Normalized value will be placed here +/// - `iter`: Iterator over bytes of `input` +/// - `input`: Original non-normalized value +/// - `last_pos`: Index of the last byte in `input` that was processed +/// - `index`: Index of the byte in `input` that should be processed now +/// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space +/// so this parameter tracks if we seen the `\r` before processing the current byte +/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm +/// - `is_normalization_char`: a function to check if byte is the start byte of character +/// that should be normalized (UTF-8 encoding is assumed) +/// - `normalize_eol_step`: a function that performs EOL normalization of a character +/// - `resolve_entity`: Resolver of entities. 
Returns `None` for unknown entities +/// +/// # Lifetimes +/// +/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine +/// +/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize +/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends +fn normalize_attr_step<'entity, C, E, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + last_pos: usize, + index: usize, + depth: usize, + is_normalization_char: &C, + normalize_eol_step: &E, + resolve_entity: &mut F, +) -> Result +where + C: Fn(&u8) -> bool, + E: Fn(&mut String, &str, usize, char) -> usize, + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + if depth == 0 { + return Err(EscapeError::TooManyNestedEntities); + } + // 4. For another character, append the character to the normalized value. + normalized.push_str(&input[last_pos..index]); + + match input.as_bytes()[index] { + b'&' => { + let start = index + 1; // +1 - skip `&` + let end = start + + match iter.position(|&b| b == b';') { + Some(end) => end, + None => return Err(EscapeError::UnterminatedEntity(index..input.len())), + }; + + // Content between & and ; - &pat; + // Note, that this content have non-normalized EOLs as required by the specification, + // but because numbers in any case cannot have spaces inside, this is not the problem. + // Normalization of spaces in entity references and checking that they corresponds to + // [`Name`] production on conscience `resolve_entity`. + // + // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name + let pat = &input[start..end]; + // 1. For a character reference, append the referenced character + // to the normalized value. 
+ if pat.starts_with('#') { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + } else + // Special case: '&' resolves to '&' and if follow this algorithm + // without special handling, we got unterminated entity error + if pat == "amp" { + normalized.push('&'); + } else + // 2. For an entity reference, recursively apply this algorithm + // to the replacement text of the entity. + if let Some(value) = resolve_entity(pat) { + normalize_attr_steps( + normalized, + &mut value.as_bytes().iter(), + value, + 0, + depth.saturating_sub(1), + is_normalization_char, + normalize_eol_step, + resolve_entity, + )?; + } else { + return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string())); + } + Ok(end + 1) // +1 - skip `;` + } + // 3. For a white space character (#x20, #xD, #xA, #x9), append + // a space character (#x20) to the normalized value. + // Space character (#x20) has no special meaning, so it is handled on step 4 + b'\t' => { + normalized.push(' '); + Ok(index + 1) // +1 - skip \t + } + _ => { + let pos = normalize_eol_step(normalized, input, index, ' '); + // We should advance iterator because we may skip several characters + for _ in 0..pos - index - 1 { + iter.next(); + } + Ok(pos) + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Resolves predefined XML entities or all HTML5 entities depending on the feature /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html). 
/// @@ -2285,4 +2597,283 @@ mod normalization { } } } + + mod attribute { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!( + normalize_xml10_attribute_value("", 5, |_| { None }), + Ok("".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("", 5, |_| { None }), + Ok("".into()) + ); + } + + #[test] + fn only_spaces() { + assert_eq!( + normalize_xml10_attribute_value(" ", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_xml11_attribute_value(" ", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\t\t\t", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\t\t\t", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\r\r\r", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\r\r\r", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\n\n\n", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\n\n\n", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\r\n\r\n\r\n", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\r\n\r\n\r\n", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\u{0085}\u{0085}\u{0085}", 5, |_| { None }), + Ok("\u{0085}\u{0085}\u{0085}".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\u{0085}\u{0085}\u{0085}", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\r\u{0085}\r\u{0085}\r\u{0085}", 5, |_| { None }), + Ok(" \u{0085} \u{0085} \u{0085}".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\r\u{0085}\r\u{0085}\r\u{0085}", 5, |_| { None }), + Ok(" ".into()) + ); + + assert_eq!( + normalize_xml10_attribute_value("\u{2028}\u{2028}\u{2028}", 5, |_| { None }), + 
Ok("\u{2028}\u{2028}\u{2028}".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("\u{2028}\u{2028}\u{2028}", 5, |_| { None }), + Ok(" ".into()) + ); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_xml10_attribute_value("already normalized", 5, |_| { None }), + Ok("already normalized".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("already normalized", 5, |_| { None }), + Ok("already normalized".into()) + ); + } + + #[test] + fn characters() { + assert_eq!( + normalize_xml10_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + assert_eq!( + normalize_xml10_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + + assert_eq!( + normalize_xml11_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + } + + #[test] + fn entities() { + assert_eq!( + normalize_xml10_attribute_value("string with &entity; reference", 5, |_| { + Some("replacement") + }), + Ok("string with replacement reference".into()) + ); + assert_eq!( + normalize_xml10_attribute_value("string with &entity-1; reference", 5, |entity| { + match entity { + "entity-1" => Some("recursive &entity-2;"), + "entity-2" => Some("entity 2"), + _ => None, + } + }), + Ok("string with recursive entity 2 reference".into()) + ); + // Special case: '&' should not treated as unterminated reference, but everything '&...' 
should + assert_eq!( + normalize_xml10_attribute_value( + "string with &entity;amp; reference", + 5, + |entity| { + match entity { + "entity" => Some("&"), + "amp" => Some("&"), + _ => None, + } + } + ), + Ok("string with & reference".into()) + ); + + assert_eq!( + normalize_xml11_attribute_value("string with &entity; reference", 5, |_| { + Some("replacement") + }), + Ok("string with replacement reference".into()) + ); + assert_eq!( + normalize_xml11_attribute_value("string with &entity-1; reference", 5, |entity| { + match entity { + "entity-1" => Some("recursive &entity-2;"), + "entity-2" => Some("entity 2"), + _ => None, + } + }), + Ok("string with recursive entity 2 reference".into()) + ); + // Special case: '&' should not treated as unterminated reference, but everything '&...' should + assert_eq!( + normalize_xml11_attribute_value( + "string with &entity;amp; reference", + 5, + |entity| { + match entity { + "entity" => Some("&"), + "amp" => Some("&"), + _ => None, + } + } + ), + Ok("string with & reference".into()) + ); + } + + #[test] + fn unclosed_entity() { + assert_eq!( + normalize_xml10_attribute_value( + "string with unclosed &entity reference", + // ^ = 21 ^ = 38 + 5, + |_| Some("replacement") + ), + Err(EscapeError::UnterminatedEntity(21..38)) + ); + assert_eq!( + normalize_xml10_attribute_value( + "string with unclosed (character) reference", + // ^ = 21 ^ = 47 + 5, + |_| None + ), + Err(EscapeError::UnterminatedEntity(21..47)) + ); + + assert_eq!( + normalize_xml11_attribute_value( + "string with unclosed &entity reference", + // ^ = 21 ^ = 38 + 5, + |_| Some("replacement") + ), + Err(EscapeError::UnterminatedEntity(21..38)) + ); + assert_eq!( + normalize_xml11_attribute_value( + "string with unclosed (character) reference", + // ^ = 21 ^ = 47 + 5, + |_| None + ), + Err(EscapeError::UnterminatedEntity(21..47)) + ); + } + + #[test] + fn unknown_entity() { + assert_eq!( + normalize_xml10_attribute_value( + "string with unknown &entity; reference", + // 
^ ^ = 21..27 + 5, + |_| None + ), + Err(EscapeError::UnrecognizedEntity( + 21..27, + "entity".to_string(), + )) + ); + + assert_eq!( + normalize_xml11_attribute_value( + "string with unknown &entity; reference", + // ^ ^ = 21..27 + 5, + |_| None + ), + Err(EscapeError::UnrecognizedEntity( + 21..27, + "entity".to_string(), + )) + ); + } + + #[test] + fn recursive_entity() { + assert_eq!( + normalize_xml10_attribute_value("&entity; reference", 5, |_| Some( + "recursive &entity;" + )), + Err(EscapeError::TooManyNestedEntities), + ); + + assert_eq!( + normalize_xml11_attribute_value("&entity; reference", 5, |_| Some( + "recursive &entity;" + )), + Err(EscapeError::TooManyNestedEntities), + ); + } + } } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index f0c73d2a1..9798d8467 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,9 +4,10 @@ use crate::encoding::Decoder; use crate::errors::Result as XmlResult; -use crate::escape::{escape, resolve_predefined_entity, unescape_with}; +use crate::escape::{escape, resolve_predefined_entity}; use crate::name::{LocalName, Namespace, NamespaceResolver, QName}; use crate::utils::{is_whitespace, Bytes}; +use crate::XmlVersion; use std::fmt::{self, Debug, Display, Formatter}; use std::iter::FusedIterator; @@ -15,11 +16,11 @@ use std::{borrow::Cow, ops::Range}; /// A struct representing a key/value XML attribute. /// /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely -/// want to access the value using one of the [`unescape_value`] and [`decode_and_unescape_value`] +/// want to access the value using one of the [`normalized_value`] and [`decoded_and_normalized_value`] /// functions. 
/// -/// [`unescape_value`]: Self::unescape_value -/// [`decode_and_unescape_value`]: Self::decode_and_unescape_value +/// [`normalized_value`]: Self::normalized_value +/// [`decoded_and_normalized_value`]: Self::decoded_and_normalized_value #[derive(Clone, Eq, PartialEq)] pub struct Attribute<'a> { /// The key to uniquely define the attribute. @@ -31,7 +32,250 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per [the XML specification] (or [for 1.0]). + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// The following escape sequences are replaced with their unescaped equivalents: + /// + /// | Escape Sequence | Replacement + /// |-----------------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + /// | `'` | `'` + /// | `"` | `"` + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, although you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` (only XML 1.1) + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` (only XML 1.1) + /// - `\x2028` (only XML 1.1) + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`normalized_value_with()`](Self::normalized_value_with). + /// + ///
+ /// + /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, + /// should only be used by applications. + /// Libs should use [`decoded_and_normalized_value()`](Self::decoded_and_normalized_value) + /// instead, because if lib will be used in a project which depends on quick_xml with + /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. + /// + ///
+ /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [for 1.0]: https://www.w3.org/TR/xml/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`encoding`]: ../../index.html#encoding + /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value(&self, version: XmlVersion) -> XmlResult> { + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enough + self.normalized_value_with(version, 1, resolve_predefined_entity) + } + + /// Returns the attribute value normalized as per [the XML specification] (or [for 1.0]), + /// using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// A function for resolving entities can be provided as `resolve_entity`. + /// This method does not resolve any predefined entities, but you can use + /// [`resolve_predefined_entity`] in your function. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, although you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` (only XML 1.1) + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` (only XML 1.1) + /// - `\x2028` (only XML 1.1) + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`normalized_value()`](Self::normalized_value). + /// + ///
+ /// + /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, + /// should only be used by applications. + /// Libs should use [`decoded_and_normalized_value_with()`](Self::decoded_and_normalized_value_with) + /// instead, because if lib will be used in a project which depends on quick_xml with + /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. + /// + ///
+ /// + /// # Parameters + /// + /// - `depth`: maximum number of nested entities that can be expanded. If expansion + /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`] + /// - `resolve_entity`: a function to resolve entity. This function could be called + /// multiple times on the same input and can return different values in each case + /// for the same input, although it is not recommended + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [for 1.0]: https://www.w3.org/TR/xml/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`encoding`]: ../../index.html#encoding + /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification + /// [`EscapeError::TooManyNestedEntities`]: crate::escape::EscapeError::TooManyNestedEntities + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value_with<'entity>( + &self, + version: XmlVersion, + depth: usize, + resolve_entity: impl FnMut(&str) -> Option<&'entity str>, + ) -> XmlResult> { + use crate::encoding::EncodingError; + use std::str::from_utf8; + + let decoded = match &self.value { + Cow::Borrowed(bytes) => Cow::Borrowed(from_utf8(bytes).map_err(EncodingError::Utf8)?), + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => { + Cow::Owned(from_utf8(bytes).map_err(EncodingError::Utf8)?.to_owned()) + } + }; + + match version.normalize_attribute_value(&decoded, depth, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification] (or [for 1.0]). + /// + /// Do not use this method with HTML attributes. 
+ /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// The following escape sequences are replaced with their unescaped equivalents: + /// + /// | Escape Sequence | Replacement + /// |-----------------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + /// | `'` | `'` + /// | `"` | `"` + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, although you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` (only XML 1.1) + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` (only XML 1.1) + /// - `\x2028` (only XML 1.1) + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`decoded_and_normalized_value_with()`](#method.decoded_and_normalized_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [for 1.0]: https://www.w3.org/TR/xml/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + pub fn decoded_and_normalized_value( + &self, + version: XmlVersion, + decoder: Decoder, + ) -> XmlResult> { + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enough + self.decoded_and_normalized_value_with(version, decoder, 1, resolve_predefined_entity) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification] (or [for 1.0]), using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). 
+ /// + /// A function for resolving entities can be provided as `resolve_entity`. + /// This method does not resolve any predefined entities, but you can use + /// [`resolve_predefined_entity`] in your function. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, although you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` (only XML 1.1) + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` (only XML 1.1) + /// - `\x2028` (only XML 1.1) + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`decoded_and_normalized_value()`](#method.decoded_and_normalized_value) + /// + /// # Parameters + /// + /// - `depth`: maximum number of nested entities that can be expanded. If expansion + /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`] + /// - `resolve_entity`: a function to resolve entity. 
This function could be called + /// multiple times on the same input and can return different values in each case + /// for the same input, although it is not recommended + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [for 1.0]: https://www.w3.org/TR/xml/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`EscapeError::TooManyNestedEntities`]: crate::escape::EscapeError::TooManyNestedEntities + pub fn decoded_and_normalized_value_with<'entity>( + &self, + version: XmlVersion, + decoder: Decoder, + depth: usize, + resolve_entity: impl FnMut(&str) -> Option<&'entity str>, + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => decoder.decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(), + }; + + match version.normalize_attribute_value(&decoded, depth, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -44,7 +288,7 @@ impl<'a> Attribute<'a> { /// /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, /// should only be used by applications. - /// Libs should use [`decode_and_unescape_value()`](Self::decode_and_unescape_value) + /// Libs should use [`decoded_and_normalized_value()`](Self::decoded_and_normalized_value) /// instead, because if lib will be used in a project which depends on quick_xml with /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. 
/// @@ -53,8 +297,10 @@ impl<'a> Attribute<'a> { /// [`encoding`]: ../../index.html#encoding /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification #[cfg(any(doc, not(feature = "encoding")))] + #[deprecated = "use `Self::normalized_value()`"] pub fn unescape_value(&self) -> XmlResult> { - self.unescape_value_with(resolve_predefined_entity) + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enough + self.normalized_value_with(XmlVersion::V1_0, 1, resolve_predefined_entity) } /// Decodes using UTF-8 then unescapes the value, using custom entities. @@ -72,7 +318,7 @@ impl<'a> Attribute<'a> { /// /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, /// should only be used by applications. - /// Libs should use [`decode_and_unescape_value_with()`](Self::decode_and_unescape_value_with) + /// Libs should use [`decoded_and_normalized_value_with()`](Self::decoded_and_normalized_value_with) /// instead, because if lib will be used in a project which depends on quick_xml with /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. /// @@ -81,38 +327,41 @@ impl<'a> Attribute<'a> { /// [`encoding`]: ../../index.html#encoding /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification #[cfg(any(doc, not(feature = "encoding")))] + #[deprecated = "use `Self::normalized_value_with()`"] #[inline] pub fn unescape_value_with<'entity>( &self, resolve_entity: impl FnMut(&str) -> Option<&'entity str>, ) -> XmlResult> { - self.decode_and_unescape_value_with(Decoder::utf8(), resolve_entity) + self.normalized_value_with(XmlVersion::V1_0, 128, resolve_entity) } /// Decodes then unescapes the value. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. 
+ #[deprecated = "use `Self::decoded_and_normalized_value()`"] pub fn decode_and_unescape_value(&self, decoder: Decoder) -> XmlResult> { - self.decode_and_unescape_value_with(decoder, resolve_predefined_entity) + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enough + self.decoded_and_normalized_value_with( + XmlVersion::V1_0, + decoder, + 1, + resolve_predefined_entity, + ) } /// Decodes then unescapes the value with custom entities. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. + #[deprecated = "use `Self::decoded_and_normalized_value_with()`"] pub fn decode_and_unescape_value_with<'entity>( &self, decoder: Decoder, resolve_entity: impl FnMut(&str) -> Option<&'entity str>, ) -> XmlResult> { - let decoded = decoder.decode_cow(&self.value)?; - - match unescape_with(&decoded, resolve_entity)? { - // Because result is borrowed, no replacements was done and we can use original string - Cow::Borrowed(_) => Ok(decoded), - Cow::Owned(s) => Ok(s.into()), - } + self.decoded_and_normalized_value_with(XmlVersion::V1_0, decoder, 128, resolve_entity) } /// If attribute value [represents] valid boolean values, returns `Some`, otherwise returns `None`. @@ -1010,6 +1259,160 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + mod attribute_value_normalization { + use super::*; + use crate::errors::Error; + use crate::escape::EscapeError::*; + use crate::XmlVersion::*; + use pretty_assertions::assert_eq; + + /// Empty values returned are unchanged + #[test] + fn empty() { + let raw_value = "".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + let value = attr + .decoded_and_normalized_value(V1_0, Decoder::utf8()) + .unwrap(); + assert_eq!(value, ""); + // assert_eq! 
does not check if value is borrowed, but this is important + assert!(matches!(value, Cow::Borrowed(_))); + + let value = attr + .decoded_and_normalized_value(V1_1, Decoder::utf8()) + .unwrap(); + assert_eq!(value, ""); + // assert_eq! does not check if value is borrowed, but this is important + assert!(matches!(value, Cow::Borrowed(_))); + } + + /// Already normalized values are returned unchanged + #[test] + fn already_normalized() { + let raw_value = "foobar123".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + let value = attr + .decoded_and_normalized_value(V1_0, Decoder::utf8()) + .unwrap(); + assert_eq!(value, "foobar123"); + // assert_eq! does not check if value is borrowed, but this is important + assert!(matches!(value, Cow::Borrowed(_))); + + let value = attr + .decoded_and_normalized_value(V1_1, Decoder::utf8()) + .unwrap(); + assert_eq!(value, "foobar123"); + // assert_eq! does not check if value is borrowed, but this is important + assert!(matches!(value, Cow::Borrowed(_))); + } + + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with + /// a space character, \r\n and \r\u{85} should be replaced by one space in 1.1 + #[test] + fn space_replacement() { + let raw_value = "\r\nfoo\u{85}\u{2028}\rbar\tbaz\n\ndelta\n\r\u{85}".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + assert_eq!( + attr.decoded_and_normalized_value(V1_0, Decoder::utf8()) + .unwrap(), + " foo\u{85}\u{2028} bar baz delta \u{85}" + ); + assert_eq!( + attr.decoded_and_normalized_value(V1_1, Decoder::utf8()) + .unwrap(), + " foo bar baz delta " + ); + } + + /// Entities must be terminated + #[test] + fn unterminated_entity() { + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + match attr.decoded_and_normalized_value(V1_0, Decoder::utf8()) { + Err(Error::Escape(err)) => assert_eq!(err, UnterminatedEntity(3..11)), + x => panic!("Expected Err(Escape(_)), got 
{:?}", x), + } + + match attr.decoded_and_normalized_value(V1_1, Decoder::utf8()) { + Err(Error::Escape(err)) => assert_eq!(err, UnterminatedEntity(3..11)), + x => panic!("Expected Err(Escape(_)), got {:?}", x), + } + } + + /// Unknown entities raise error + #[test] + fn unrecognized_entity() { + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + match attr.decoded_and_normalized_value(V1_0, Decoder::utf8()) { + // TODO: is this divergence between range behavior of UnterminatedEntity + // and UnrecognizedEntity appropriate? existing unescape code behaves the same. (see: start index) + Err(Error::Escape(err)) => { + assert_eq!(err, UnrecognizedEntity(4..8, "unkn".to_owned())) + } + x => panic!("Expected Err(Escape(err)), got {:?}", x), + } + match attr.decoded_and_normalized_value(V1_1, Decoder::utf8()) { + // TODO: is this divergence between range behavior of UnterminatedEntity + // and UnrecognizedEntity appropriate? existing unescape code behaves the same. 
(see: start index) + Err(Error::Escape(err)) => { + assert_eq!(err, UnrecognizedEntity(4..8, "unkn".to_owned())) + } + x => panic!("Expected Err(Escape(err)), got {:?}", x), + } + } + + /// custom entity replacement works, entity replacement text processed recursively + #[test] + fn entity_replacement() { + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some(" "), + "a" => Some(" "), + "da" => Some(" "), + _ => None, + } + } + + assert_eq!( + attr.decoded_and_normalized_value_with(V1_0, Decoder::utf8(), 5, &custom_resolver) + .unwrap(), + "\r\rA\n \nB\r\n" + ); + assert_eq!( + attr.decoded_and_normalized_value_with(V1_1, Decoder::utf8(), 5, &custom_resolver) + .unwrap(), + "\r\rA\n \nB\r\n" + ); + } + + #[test] + fn char_references() { + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + + assert_eq!( + attr.decoded_and_normalized_value(V1_0, Decoder::utf8()) + .unwrap(), + "\r\rA\n\nB\r\n" + ); + assert_eq!( + attr.decoded_and_normalized_value(V1_1, Decoder::utf8()) + .unwrap(), + "\r\rA\n\nB\r\n" + ); + } + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/lib.rs b/src/lib.rs index 46d69da19..bd37f91d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -71,6 +71,8 @@ pub mod serde_helpers; pub mod utils; pub mod writer; +use std::borrow::Cow; + // reexports pub use crate::encoding::Decoder; #[cfg(feature = "serialize")] @@ -91,6 +93,24 @@ pub enum XmlVersion { V1_1, } +impl XmlVersion { + pub(crate) fn normalize_attribute_value<'input, 'entity, F>( + &self, + value: &'input str, + depth: usize, + resolve_entity: F, + ) -> std::result::Result, escape::EscapeError> + where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) 
-> Option<&'entity str>, + { + match self { + Self::V1_0 => escape::normalize_xml10_attribute_value(value, depth, resolve_entity), + Self::V1_1 => escape::normalize_xml11_attribute_value(value, depth, resolve_entity), + } + } +} + impl Default for XmlVersion { #[inline] fn default() -> Self { diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index 90c72f18c..75555ddf0 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -3,6 +3,7 @@ use quick_xml::errors::{Error, IllFormedError}; use quick_xml::events::Event; use quick_xml::reader::Reader; +use quick_xml::XmlVersion; #[test] fn fuzz_53() { @@ -31,7 +32,8 @@ fn fuzz_101() { Ok(Event::Start(e)) | Ok(Event::Empty(e)) => { for a in e.attributes() { if a.ok().map_or(true, |a| { - a.decode_and_unescape_value(reader.decoder()).is_err() + a.decoded_and_normalized_value(XmlVersion::V1_0, reader.decoder()) + .is_err() }) { break; } diff --git a/tests/serde-de.rs b/tests/serde-de.rs index ac35ff63f..7f1850a3a 100644 --- a/tests/serde-de.rs +++ b/tests/serde-de.rs @@ -1563,7 +1563,8 @@ mod xml_schema_lists { list!(string: String = r#""# => vec![ "first".to_string(), "second".to_string(), - "third 3".to_string(), + "third".to_string(), + "3".to_string(), ]); err!(byte_buf: ByteBuf = r#""# => Custom("invalid type: string \"first\", expected byte data")); @@ -1979,6 +1980,9 @@ mod xml_version { #[derive(Debug, Deserialize, PartialEq)] struct Root { + #[serde(rename = "@attribute")] + attribute: String, + #[serde(rename = "$text")] text: String, } @@ -1988,11 +1992,12 @@ mod xml_version { assert_eq!( from_str::( "\ - \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ " ) .unwrap(), Root { + attribute: " , , , \u{0085},\u{0085},\u{2028}".to_string(), text: "\n,\n,\n,\n\u{0085},\u{0085},\u{2028}".to_string(), } ); @@ -2004,11 +2009,12 @@ mod xml_version { from_str::( "\ \ - \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ " ) .unwrap(), Root { + attribute: " , , , 
\u{0085},\u{0085},\u{2028}".to_string(), text: "\n,\n,\n,\n\u{0085},\u{0085},\u{2028}".to_string(), } ); @@ -2020,11 +2026,12 @@ mod xml_version { from_str::( "\ \ - \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ " ) .unwrap(), Root { + attribute: " , , , , , ".to_string(), text: "\n,\n,\n,\n,\n,\n".to_string(), } ); @@ -2035,7 +2042,7 @@ mod xml_version { match from_str::( "\ \ - \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ + \r\n,\n,\r,\r\u{0085},\u{0085},\u{2028}\ ", ) { Err(DeError::InvalidXml(Error::IllFormed(cause))) => { diff --git a/tests/serde-issues.rs b/tests/serde-issues.rs index 99496c934..8f962d62a 100644 --- a/tests/serde-issues.rs +++ b/tests/serde-issues.rs @@ -539,6 +539,25 @@ fn issue655() { ); } +/// Regression test for https://github.com/tafia/quick-xml/issues/674 +#[test] +fn issue674() { + #[derive(Debug, Deserialize)] + struct Any { + #[serde(rename = "@list")] + list: Vec, + } + + #[derive(Debug, PartialEq, Deserialize)] + enum Item { + Foo, + Bar, + } + + let any: Any = from_str("").unwrap(); + assert_eq!(any.list, [Item::Foo, Item::Bar]); +} + /// Regression test for https://github.com/tafia/quick-xml/issues/683. #[test] fn issue683() {