From 623c92cfb8908d46437e5a76a4fe06b2365898a5 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 15 Nov 2025 18:14:11 +0500 Subject: [PATCH 1/7] Rewrite `read_bang_element` with the same style as `read_with`, `read_ref` and `read_text` (review with whitespace ignored mode) --- src/reader/buffered_reader.rs | 42 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 32aa313e..129675ed 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -256,35 +256,33 @@ macro_rules! impl_buffered_source { }; loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF + let available = match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) if n.is_empty() => break, - Ok(available) => { - // We only parse from start because we don't want to consider - // whatever is in the buffer before the bang element - if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? .consume(used); - read += used as u64; - - *position += read; - return Ok((bang_type, &buf[start..])); - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used as u64; - } - } + Ok(n) => n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => { *position += read; return Err(Error::Io(e.into())); } + }; + // We only parse from start because we don't want to consider + // whatever is in the buffer before the bang element + if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) { + buf.extend_from_slice(consumed); + + self $(.$reader)? 
.consume(used); + read += used as u64; + + *position += read; + return Ok((bang_type, &buf[start..])); } + + // The `>` symbol not yet found, continue reading + buf.extend_from_slice(available); + + let used = available.len(); + self $(.$reader)? .consume(used); + read += used as u64; } *position += read; From e3230c24f35b41792b5a23fd46eb8f73ab402781 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 15 Nov 2025 18:34:15 +0500 Subject: [PATCH 2/7] Append +1 outside of BangType, in read_bang_element, like read_with do --- src/parser/dtd.rs | 6 ++---- src/reader/buffered_reader.rs | 5 +++-- src/reader/mod.rs | 12 ++++++------ src/reader/slice_reader.rs | 7 ++++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/parser/dtd.rs b/src/parser/dtd.rs index 3b10758d..d72f73b3 100644 --- a/src/parser/dtd.rs +++ b/src/parser/dtd.rs @@ -103,8 +103,7 @@ impl DtdParser { b'>' => { *self = Self::Finished; let len = chunk.len() - cur.len() + i; - // +1 for `>` - return Some((&chunk[..len], len + 1)); + return Some((&chunk[..len], len)); } _ => {} } @@ -146,8 +145,7 @@ impl DtdParser { if let Some(i) = memchr::memchr(b'>', cur) { *self = Self::Finished; let len = chunk.len() - cur.len() + i; - // +1 for `>` - return Some((&chunk[..len], len + 1)); + return Some((&chunk[..len], len)); } break; } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 129675ed..2bee5918 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -267,7 +267,8 @@ macro_rules! impl_buffered_source { }; // We only parse from start because we don't want to consider // whatever is in the buffer before the bang element - if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) { + if let Some((consumed, i)) = bang_type.parse(&buf[start..], available) { + let used = i + 1; // +1 for `>` which we do not include buf.extend_from_slice(consumed); self $(.$reader)? .consume(used); @@ -286,7 +287,7 @@ macro_rules! 
impl_buffered_source { } *position += read; - Err(bang_type.to_err().into()) + Err(Error::Syntax(bang_type.to_err())) } #[inline] diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 79e94ebd..d8e8c807 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1179,17 +1179,17 @@ impl BangType { // check_comments enabled option. XML standard requires that comment // will not end with `--->` sequence because this is a special case of // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i + 1)); // +1 for `>` + return Some((&chunk[..i], i)); } // End sequence `-|->` was splitted at | // buf --/ \-- chunk if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i + 1)); // +1 for `>` + return Some((&chunk[..i], i)); } // End sequence `--|>` was splitted at | // buf --/ \-- chunk if i == 0 && buf.ends_with(b"--") { - return Some((&[], i + 1)); // +1 for `>` + return Some((&[], i)); } } } @@ -1197,17 +1197,17 @@ impl BangType { Self::CData => { for i in memchr::memchr_iter(b'>', chunk) { if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i], i + 1)); // +1 for `>` + return Some((&chunk[..i], i)); } // End sequence `]|]>` was splitted at | // buf --/ \-- chunk if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { - return Some((&chunk[..i], i + 1)); // +1 for `>` + return Some((&chunk[..i], i)); } // End sequence `]]|>` was splitted at | // buf --/ \-- chunk if i == 0 && buf.ends_with(b"]]") { - return Some((&[], i + 1)); // +1 for `>` + return Some((&[], i)); } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 07a74d18..5e022e15 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -362,13 +362,14 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { let mut bang_type = BangType::new(self.get(2).copied())?; if let Some((bytes, i)) = bang_type.parse(&[], self) { - *position += i as u64; - *self = &self[i..]; + let consumed = i + 1; // +1 for `>` which 
we do not include + *position += consumed as u64; + *self = &self[consumed..]; return Ok((bang_type, bytes)); } *position += self.len() as u64; - Err(bang_type.to_err().into()) + Err(Error::Syntax(bang_type.to_err())) } #[inline] From 241f01e20ff679e9248f2ae424c9ba823fc15444 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 15 Nov 2025 18:39:16 +0500 Subject: [PATCH 3/7] Return only index from BangType::parse (renamed to feed) like in other parsers --- src/parser/dtd.rs | 8 +++----- src/reader/buffered_reader.rs | 10 +++++----- src/reader/mod.rs | 14 +++++++------- src/reader/slice_reader.rs | 3 ++- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/parser/dtd.rs b/src/parser/dtd.rs index d72f73b3..03ea373b 100644 --- a/src/parser/dtd.rs +++ b/src/parser/dtd.rs @@ -69,7 +69,7 @@ impl DtdParser { /// # Parameters (as same as `reader::BangType::parse`) /// - `buf`: buffer with data consumed on previous iterations /// - `chunk`: data read on current iteration and not yet consumed from reader - pub fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { + pub fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option { // This method assumes the DTD is well-formed. 
// Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs // is not particularly problematic; the only point of interest is reporting well-formed DTDs @@ -102,8 +102,7 @@ impl DtdParser { } b'>' => { *self = Self::Finished; - let len = chunk.len() - cur.len() + i; - return Some((&chunk[..len], len)); + return Some(chunk.len() - cur.len() + i); } _ => {} } @@ -144,8 +143,7 @@ impl DtdParser { Self::AfterInternalSubset => { if let Some(i) = memchr::memchr(b'>', cur) { *self = Self::Finished; - let len = chunk.len() - cur.len() + i; - return Some((&chunk[..len], len)); + return Some(chunk.len() - cur.len() + i); } break; } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 2bee5918..e899d1fe 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -267,12 +267,12 @@ macro_rules! impl_buffered_source { }; // We only parse from start because we don't want to consider // whatever is in the buffer before the bang element - if let Some((consumed, i)) = bang_type.parse(&buf[start..], available) { - let used = i + 1; // +1 for `>` which we do not include - buf.extend_from_slice(consumed); + if let Some(i) = bang_type.feed(&buf[start..], available) { + let consumed = i + 1; // +1 for `>` which we do not include + buf.extend_from_slice(&available[..i]); - self $(.$reader)? .consume(used); - read += used as u64; + self $(.$reader)? 
.consume(consumed); + read += consumed as u64; *position += read; return Ok((bang_type, &buf[start..])); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index d8e8c807..5b0ef452 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1166,7 +1166,7 @@ impl BangType { /// - `buf`: buffer with data consumed on previous iterations /// - `chunk`: data read on current iteration and not yet consumed from reader #[inline(always)] - fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { + fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option { match self { Self::Comment => { for i in memchr::memchr_iter(b'>', chunk) { @@ -1179,17 +1179,17 @@ impl BangType { // check_comments enabled option. XML standard requires that comment // will not end with `--->` sequence because this is a special case of // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i)); + return Some(i); } // End sequence `-|->` was splitted at | // buf --/ \-- chunk if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i)); + return Some(i); } // End sequence `--|>` was splitted at | // buf --/ \-- chunk if i == 0 && buf.ends_with(b"--") { - return Some((&[], i)); + return Some(i); } } } @@ -1197,17 +1197,17 @@ impl BangType { Self::CData => { for i in memchr::memchr_iter(b'>', chunk) { if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i], i)); + return Some(i); } // End sequence `]|]>` was splitted at | // buf --/ \-- chunk if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { - return Some((&chunk[..i], i)); + return Some(i); } // End sequence `]]|>` was splitted at | // buf --/ \-- chunk if i == 0 && buf.ends_with(b"]]") { - return Some((&[], i)); + return Some(i); } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 5e022e15..62dfc3d8 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -361,9 +361,10 @@ impl<'a> XmlSource<'a, ()> 
for &'a [u8] { let mut bang_type = BangType::new(self.get(2).copied())?; - if let Some((bytes, i)) = bang_type.parse(&[], self) { + if let Some(i) = bang_type.feed(&[], self) { let consumed = i + 1; // +1 for `>` which we do not include *position += consumed as u64; + let bytes = &self[..i]; *self = &self[consumed..]; return Ok((bang_type, bytes)); } From c34af489c6d5fc90b1efde2d497db86fe051f7b9 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 7 Feb 2026 16:31:14 +0500 Subject: [PATCH 4/7] Place `>` to the buffer when read comment, CDATA or DOCTYPE --- src/reader/buffered_reader.rs | 4 ++-- src/reader/mod.rs | 12 +++++----- src/reader/slice_reader.rs | 6 ++--- src/reader/state.rs | 43 +++++++++++++++++++---------------- 4 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index e899d1fe..0f622535 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -268,8 +268,8 @@ macro_rules! impl_buffered_source { // We only parse from start because we don't want to consider // whatever is in the buffer before the bang element if let Some(i) = bang_type.feed(&buf[start..], available) { - let consumed = i + 1; // +1 for `>` which we do not include - buf.extend_from_slice(&available[..i]); + let consumed = i + 1; // +1 for `>` + buf.extend_from_slice(&available[..consumed]); self $(.$reader)? 
.consume(consumed); read += consumed as u64; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 5b0ef452..9b62d075 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1306,7 +1306,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"")) ); assert_eq!(position, 12); } @@ -1327,7 +1327,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"content]]")) + (BangType::CData, Bytes(b"content]]>")) ); assert_eq!(position, 29); } @@ -1452,7 +1452,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"")) ); assert_eq!(position, 7); } @@ -1470,7 +1470,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"comment<---")) + (BangType::Comment, Bytes(b"comment<--->")) ); assert_eq!(position, 18); } @@ -1531,7 +1531,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(DtdParser::Finished), Bytes(b"")) ); assert_eq!(position, 10); } @@ -1605,7 +1605,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType(DtdParser::Finished), Bytes(b"")) ); assert_eq!(position, 10); } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 62dfc3d8..12d3940c 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -362,10 +362,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { let mut bang_type = BangType::new(self.get(2).copied())?; if let Some(i) = bang_type.feed(&[], self) { - let consumed = i + 1; // +1 for `>` which we do not include + let consumed = i + 1; // +1 for `>` *position += consumed as u64; - let bytes = &self[..i]; - *self = &self[consumed..]; + let (bytes, rest) = self.split_at(consumed); + *self = rest; return Ok((bang_type, bytes)); } diff --git a/src/reader/state.rs b/src/reader/state.rs index ef96ba7a..12fa4ac1 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -87,6 +87,11 @@ impl ReaderState { "CDATA, comment or DOCTYPE must start 
from '"), + "CDATA, comment or DOCTYPE must end with '>':\n{:?}", + crate::utils::Bytes(buf) + ); let uncased_starts_with = |string: &[u8], prefix: &[u8]| { string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) @@ -96,13 +101,13 @@ impl ReaderState { match bang_type { BangType::Comment if buf.starts_with(b""), + "comment must end with '-->':\n{:?}", crate::utils::Bytes(buf) ); if self.config.check_comments { // search if '--' not in comments - let mut haystack = &buf[4..len - 2]; + let mut haystack = &buf[4..len - 3]; let mut off = 0; while let Some(p) = memchr::memchr(b'-', haystack) { off += p + 1; @@ -115,14 +120,14 @@ impl ReaderState { // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~~ : - buf + // ~~~~~~~~~~~~~~~~~~: - buf // : =========== : - zone of search (possible values of `p`) // : |---p : - p is counted from | (| is 0) // : : : ^ - self.offset // ^ : : - self.offset - len - // ^ : - self.offset - len + 3 - // ^ - self.offset - len + 3 + p - self.last_error_offset = self.offset - len as u64 + 3 + p as u64; + // ^ : - self.offset - len + 4 + // ^ - self.offset - len + 4 + p + self.last_error_offset = self.offset - len as u64 + 4 + p as u64; return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } // Continue search after single `-` (+1 to skip it) @@ -130,8 +135,8 @@ impl ReaderState { } } Ok(Event::Comment(BytesText::wrap( - // Cut of `` from start and end + &buf[4..len - 3], self.decoder(), ))) } @@ -141,13 +146,13 @@ impl ReaderState { // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::CData if buf.starts_with(b" { debug_assert!( - buf.ends_with(b"]]"), - "CDATA must end with ']]':\n{:?}", + buf.ends_with(b"]]>"), + "CDATA must end with ']]>':\n{:?}", crate::utils::Bytes(buf) ); Ok(Event::CData(BytesCData::wrap( - // Cut of `` from start and end + &buf[9..len - 3], self.decoder(), ))) } @@ -156,10 +161,10 @@ impl ReaderState { // HTML5 allows mixed case 
for doctype declarations: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b" { - match buf[9..].iter().position(|&b| !is_whitespace(b)) { + match buf[9..len - 1].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( - // Cut of `` from the end + &buf[9 + start..len - 1], self.decoder(), ))), None => { @@ -173,9 +178,9 @@ impl ReaderState { } _ => { // - // ~~~~~~ - `buf` does not contain `>` and `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 1 and buf len - self.last_error_offset = self.offset - len as u64 - 1; + // ~~~~~~~- `buf` contains that and `self.offset` is after `>`. + // ^------- We report error at that position, so we need to subtract buf len + self.last_error_offset = self.offset - len as u64; Err(Error::Syntax(bang_type.to_err())) } } From 9a7e8f577a74f323abefc93a495004e88d60fa45 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 26 Jan 2026 00:36:45 +0500 Subject: [PATCH 5/7] Place `>` to the buffer when read elements, processing instructions and XML declarations --- src/reader/buffered_reader.rs | 8 +++--- src/reader/mod.rs | 30 +++++++++++------------ src/reader/slice_reader.rs | 8 +++--- src/reader/state.rs | 46 +++++++++++++++++++++-------------- tests/issues.rs | 4 +-- 5 files changed, 53 insertions(+), 43 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 0f622535..175cb5f1 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -211,11 +211,11 @@ macro_rules! impl_buffered_source { }; if let Some(i) = parser.feed(available) { - buf.extend_from_slice(&available[..i]); + let used = i + 1; // +1 for `>` + buf.extend_from_slice(&available[..used]); - // +1 for `>` which we do not include - self $(.$reader)? .consume(i + 1); - read += i as u64 + 1; + self $(.$reader)? 
.consume(used); + read += used as u64; *position += read; return Ok(&buf[start..]); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 9b62d075..b1b7dd4e 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1842,7 +1842,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"<") + Bytes(b"<>") ); assert_eq!(position, 2); } @@ -1856,7 +1856,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 5); } @@ -1870,7 +1870,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"<:") + Bytes(b"<:>") ); assert_eq!(position, 3); } @@ -1884,7 +1884,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"<:tag") + Bytes(b"<:tag>") ); assert_eq!(position, 6); } @@ -1898,7 +1898,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(br#""#) ); assert_eq!(position, 39); } @@ -1917,7 +1917,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 3); } @@ -1931,7 +1931,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 6); } @@ -1945,7 +1945,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"<:/") + Bytes(b"<:/>") ); assert_eq!(position, 4); } @@ -1959,7 +1959,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
.unwrap()), - Bytes(b"<:tag/") + Bytes(b"<:tag/>") ); assert_eq!(position, 7); } @@ -1973,7 +1973,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(br#""#) ); assert_eq!(position, 42); } @@ -1992,7 +1992,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 4); } @@ -2006,7 +2006,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 6); } @@ -2020,7 +2020,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 4); } @@ -2034,7 +2034,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") ); assert_eq!(position, 7); } @@ -2048,7 +2048,7 @@ mod test { assert_eq!( Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
.unwrap()), - Bytes(br#""#) ); assert_eq!(position, 40); } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 12d3940c..f897dc60 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -338,10 +338,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { P: Parser, { if let Some(i) = parser.feed(self) { - // +1 for `>` which we do not include - *position += i as u64 + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; + let used = i + 1; // +1 for `>` + *position += used as u64; + let (bytes, rest) = self.split_at(used); + *self = rest; return Ok(bytes); } diff --git a/src/reader/state.rs b/src/reader/state.rs index 12fa4ac1..a0ee3b96 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -189,16 +189,21 @@ impl ReaderState { /// Wraps content of `buf` into the [`Event::End`] event. Does the check that /// end name matches the last opened start name if `self.config.check_end_names` is set. /// - /// `buf` contains data between `<` and `>`, for example `/tag`. + /// `buf` contains data between `<` and up to, including, `>`, for example ``. pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result> { debug_assert!( buf.starts_with(b""), + "end tag must end with '>':\n{:?}", + crate::utils::Bytes(buf) + ); - // Strip the `` - let content = &buf[2..]; + // Strip the `` characters. `content` contains data between `` + let content = &buf[2..buf.len() - 1]; // XML standard permits whitespaces after the markup name in closing tags. // Let's strip them from the buffer before comparing tag names. 
let name = if self.config.trim_markup_names_in_closing_tags { @@ -224,8 +229,7 @@ impl ReaderState { self.opened_buffer.truncate(start); // Report error at start of the end tag at `<` character - // -1 for `>` - self.last_error_offset = self.offset - buf.len() as u64 - 1; + self.last_error_offset = self.offset - buf.len() as u64; return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { expected, found: decoder.decode(name).unwrap_or_default().into_owned(), @@ -238,8 +242,7 @@ impl ReaderState { None => { if !self.config.allow_unmatched_ends { // Report error at start of the end tag at `<` character - // -1 for `>` - self.last_error_offset = self.offset - buf.len() as u64 - 1; + self.last_error_offset = self.offset - buf.len() as u64; return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( decoder.decode(name).unwrap_or_default().into_owned(), ))); @@ -261,17 +264,17 @@ impl ReaderState { crate::utils::Bytes(buf) ); debug_assert!( - buf.ends_with(b"?"), - "processing instruction or XML declaration must end with '?':\n{:?}", + buf.ends_with(b"?>"), + "processing instruction or XML declaration must end with '?>':\n{:?}", crate::utils::Bytes(buf) ); let len = buf.len(); // We accept at least - // ~~~ - len = 3 - if len > 2 { - // Cut of ` 3 { + // Cut of `` from start and end + let content = &buf[2..len - 2]; let len = content.len(); if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { @@ -294,10 +297,10 @@ impl ReaderState { ))) } } else { - // - // ~~~~~~ - `buf` contains that and `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 1 and buf len - self.last_error_offset = self.offset - len as u64 - 1; + // + // ~~~~~~~- `buf` contains that and `self.offset` is after `>`. 
+ // ^------- We report error at that position, so we need to subtract buf len + self.last_error_offset = self.offset - len as u64; Err(Error::Syntax(PiParser(false).eof_error(buf))) } } @@ -312,10 +315,15 @@ impl ReaderState { "start or empty tag must start from '<':\n{:?}", crate::utils::Bytes(content) ); + debug_assert!( + content.ends_with(b">"), + "start or empty tag must end with '>':\n{:?}", + crate::utils::Bytes(content) + ); // strip `<` let content = &content[1..]; - if let Some(content) = content.strip_suffix(b"/") { + if let Some(content) = content.strip_suffix(b"/>") { // This is self-closed tag `` let event = BytesStart::wrap(content, name_len(content), self.decoder()); @@ -328,6 +336,8 @@ impl ReaderState { Event::Empty(event) } } else { + // strip `>` + let content = &content[..content.len() - 1]; let event = BytesStart::wrap(content, name_len(content), self.decoder()); // #514: Always store names event when .check_end_names == false, diff --git a/tests/issues.rs b/tests/issues.rs index b3e681f1..b4d744a4 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -623,7 +623,7 @@ fn issue939() { ); assert_eq!( quick_xml::utils::Bytes(&buf), - quick_xml::utils::Bytes(b"") ); assert_eq!( quick_xml::utils::Bytes(reader.get_ref().buffer()), @@ -636,7 +636,7 @@ fn issue939() { ); assert_eq!( quick_xml::utils::Bytes(&buf), - quick_xml::utils::Bytes(b"") ); assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); From 489dc17e4080d85885b1a30003ca40a0caea9bdb Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 18 Feb 2026 21:24:55 +0500 Subject: [PATCH 6/7] Place `;` to the buffer when read general entity references --- src/reader/buffered_reader.rs | 4 ++-- src/reader/mod.rs | 7 ++++--- src/reader/slice_reader.rs | 11 ++++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 175cb5f1..ec9e0330 100644 --- a/src/reader/buffered_reader.rs +++ 
b/src/reader/buffered_reader.rs @@ -146,10 +146,10 @@ macro_rules! impl_buffered_source { match memchr::memchr3(b';', b'&', b'<', available) { Some(i) if available[i] == b';' => { - buf.extend_from_slice(&available[..i]); - // +1 -- skip the end `;` let used = i + 1; + + buf.extend_from_slice(&available[..used]); self $(.$reader)? .consume(used); read += used as u64; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index b1b7dd4e..b8a569b2 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -285,7 +285,8 @@ macro_rules! read_event_impl { ReadRefResult::Ref(bytes) => { $self.state.state = ParseState::InsideText; // +1 to skip start `&` - Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder()))) + // -1 to skip end `;` + Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..bytes.len() - 1], $self.decoder()))) } // Go to Done state ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => { @@ -1781,7 +1782,7 @@ mod test { // ^= 3 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&;")), x => panic!("Expected `Ref(_)`, but got `{:?}`", x), } assert_eq!(position, 3); @@ -1795,7 +1796,7 @@ mod test { // ^= 5 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? 
{ - ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")), + ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"<")), x => panic!("Expected `Ref(_)`, but got `{:?}`", x), } assert_eq!(position, 5); diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index f897dc60..2a27a866 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -300,11 +300,12 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { // Search for the end of reference or a start of another reference or a markup match memchr::memchr3(b';', b'&', b'<', &self[1..]) { Some(i) if self[i + 1] == b';' => { - let end = i + 1; - let bytes = &self[..end]; - // +1 -- skip the end `;` - *self = &self[end + 1..]; - *position += end as u64 + 1; + // +1 for the start `&` + // +1 for the end `;` + let end = i + 2; + let (bytes, rest) = self.split_at(end); + *self = rest; + *position += end as u64; ReadRefResult::Ref(bytes) } From f8e8857c46d010e7c2b94f7eea2d6b0229d6056e Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 18 Feb 2026 21:53:11 +0500 Subject: [PATCH 7/7] Implement read_text_into and read_text_into_async MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copy and adapt tests in tests/ directory from borrowed variant Copy and adapt documentation of new methods from the read_text methods --- Changelog.md | 3 + src/reader/async_tokio.rs | 153 +++++++++++++- src/reader/buffered_reader.rs | 84 +++++++- src/reader/ns_reader.rs | 82 +++++++- tests/reader-namespaces.rs | 362 ++++++++++++++++++++++++++++++++++ tests/reader-read-text.rs | 323 ++++++++++++++++++++++++++++++ 6 files changed, 1003 insertions(+), 4 deletions(-) diff --git a/Changelog.md b/Changelog.md index 329cc2c0..7db049c5 100644 --- a/Changelog.md +++ b/Changelog.md @@ -16,6 +16,8 @@ ### New Features +- [#483]: Implement `read_text_into()` and `read_text_into_async()`. 
+ ### Bug Fixes - [#939]: Fix parsing error of the tag from buffered reader, when the first byte `<` @@ -24,6 +26,7 @@ ### Misc Changes +[#483]: https://github.com/tafia/quick-xml/issues/483 [#936]: https://github.com/tafia/quick-xml/pull/936 [#939]: https://github.com/tafia/quick-xml/issues/939 diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 2a6f22de..62b97fbd 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -8,7 +8,7 @@ use std::task::{Context, Poll}; use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf}; use crate::errors::{Error, IllFormedError, Result, SyntaxError}; -use crate::events::{BytesRef, Event}; +use crate::events::{BytesRef, BytesText, Event}; use crate::name::{QName, ResolveResult}; use crate::parser::{ElementParser, Parser, PiParser}; use crate::reader::buffered_reader::impl_buffered_source; @@ -180,7 +180,7 @@ impl Reader { /// [`Start`]: Event::Start pub async fn read_to_end_into_async<'n>( &mut self, - // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033` + // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033 end: QName<'n>, buf: &mut Vec, ) -> Result { @@ -196,6 +196,82 @@ impl Reader { )) } + /// An asynchronous version of [`read_text_into()`]. + /// Reads asynchronously until end element is found using provided buffer as + /// intermediate storage for events content. This function is supposed to be + /// called after you already read a [`Start`] event. + /// + /// See the documentation of [`read_text_into()`] for more information. + /// + /// # Examples + /// + /// This example shows, how you can read a HTML content from your XML document. 
+ /// + /// ``` + /// # tokio_test::block_on(async { + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::reader::Reader; + /// + /// let mut reader = Reader::from_reader(" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// ".as_bytes()); + /// reader.config_mut().trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// let mut buf = Vec::new(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.config_mut().check_end_names = false; + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text_into_async(end.name(), &mut buf).await.unwrap(); + /// let text = text.decode().unwrap(); + /// assert_eq!(text, r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#); + /// assert!(matches!(text, Cow::Borrowed(_))); + /// + /// // Now we can enable checks again + /// reader.config_mut().check_end_names = true; + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Eof); + /// # }) // tokio_test::block_on + /// ``` + /// + /// [`read_text_into()`]: Self::read_text_into + /// [`Start`]: Event::Start + pub async fn read_text_into_async<'n, 'b>( + &mut self, + // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033 + end: QName<'n>, + buf: &'b mut Vec, + ) -> Result> { + let start = buf.len(); + let span = read_to_end!(self, end, buf, read_event_into_async, {}, await); + + let len = span.end - span.start; + // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is + // not cleared when reading event, length of the returned span should fit into + // usize (because otherwise we panic at appending to the buffer before that point) + let end = start + len as usize; + + Ok(BytesText::wrap(&buf[start..end], self.decoder())) + } + /// Private function to read until `>` is found. This function expects that /// it was called just after encounter a `<` symbol. async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { @@ -344,6 +420,79 @@ impl NsReader { Ok(result) } + /// An asynchronous version of [`read_text_into()`]. + /// Reads asynchronously until end element is found using provided buffer as + /// intermediate storage for events content. This function is supposed to be + /// called after you already read a [`Start`] event. + /// + /// See the documentation of [`read_text_into()`] for more information. + /// + /// # Examples + /// + /// This example shows, how you can read a HTML content from your XML document. 
+ /// + /// ``` + /// # tokio_test::block_on(async { + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::reader::NsReader; + /// + /// let mut reader = NsReader::from_reader(" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// ".as_bytes()); + /// reader.config_mut().trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// let mut buf = Vec::new(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.config_mut().check_end_names = false; + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text_into_async(end.name(), &mut buf).await.unwrap(); + /// let text = text.decode().unwrap(); + /// assert_eq!(text, r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#); + /// assert!(matches!(text, Cow::Borrowed(_))); + /// + /// // Now we can enable checks again + /// reader.config_mut().check_end_names = true; + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Eof); + /// # }) // tokio_test::block_on + /// ``` + /// + /// [`read_text_into()`]: Self::read_text_into + /// [`Start`]: Event::Start + pub async fn read_text_into_async<'n, 'b>( + &mut self, + // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033 + end: QName<'n>, + buf: &'b mut Vec, + ) -> Result> { + // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should + // match literally the start name. See `Config::check_end_names` documentation + let result = self.reader.read_text_into_async(end, buf).await?; + // read_text_into_async will consume closing tag. Because nobody can access to its + // content anymore, we directly pop namespace of the opening tag + self.ns_resolver.pop(); + Ok(result) + } + /// An asynchronous version of [`read_resolved_event_into()`]. Reads the next /// event into given buffer asynchronously and resolves its namespace (if applicable). /// diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index ec9e0330..ee418eb5 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -6,7 +6,7 @@ use std::io::{self, BufRead, BufReader}; use std::path::Path; use crate::errors::{Error, Result}; -use crate::events::Event; +use crate::events::{BytesText, Event}; use crate::name::QName; use crate::parser::Parser; use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource}; @@ -486,6 +486,88 @@ impl Reader { buf.clear(); })) } + + /// Reads content between start and end tags, including any markup using + /// provided buffer as intermediate storage for events content. 
This function + /// is supposed to be called after you have already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have _literally_ the + /// same name. + /// + /// This method does not unescape the data it reads; instead it returns the content + /// of the XML document "as is". This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will spoil the data. + /// + /// If your reader is created from a string slice or byte array slice, it is + /// better to use the [`read_text()`] method, because it will not copy bytes + /// into an intermediate buffer. + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::reader::Reader; + /// + /// let mut reader = Reader::from_reader(" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// ".as_bytes()); + /// reader.config_mut().trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// let mut buf = Vec::new(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.config_mut().check_end_names = false; + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text_into(end.name(), &mut buf).unwrap(); + /// let text = text.decode().unwrap(); + /// assert_eq!(text, r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#); + /// assert!(matches!(text, Cow::Borrowed(_))); + /// + /// // Now we can enable checks again + /// reader.config_mut().check_end_names = true; + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`read_text()`]: Self::read_text() + pub fn read_text_into<'b>( + &mut self, + end: QName, + buf: &'b mut Vec, + ) -> Result> { + let start = buf.len(); + let span = read_to_end!(self, end, buf, read_event_impl, {}); + + let len = span.end - span.start; + // SAFETY: `buf` may contain not more than isize::MAX bytes and because it is + // not cleared when reading event, length of the returned span should fit into + // usize (because otherwise we panic at appending to the buffer before that point) + let end = start + len as usize; + + Ok(BytesText::wrap(&buf[start..end], self.decoder())) + } } impl Reader> { diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index a054f8d1..2690ae39 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -11,7 +11,7 @@ use std::ops::Deref; use std::path::Path; use crate::errors::Result; -use crate::events::Event; +use crate::events::{BytesText, Event}; use crate::name::{LocalName, NamespaceBindingsIter, NamespaceResolver, QName, ResolveResult}; use crate::reader::{Config, Reader, Span, XmlSource}; @@ -610,6 +610,86 @@ impl NsReader { self.ns_resolver.pop(); Ok(result) } + + /// Reads content between start and end tags, including any markup using + /// provided buffer as intermediate storage for events content. This function + /// is supposed to be called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the _literally_ + /// same name. 
+ /// + /// This method does not unescape the data it reads; instead it returns the content + /// of the XML document "as is". This is because it has no idea what text + /// it reads, and if, for example, it contains a CDATA section, an attempt to + /// unescape its content will spoil the data. + /// + /// If your reader is created from a string slice or byte array slice, it is + /// better to use the [`read_text()`] method, because it will not copy bytes + /// into an intermediate buffer. + /// + /// # Examples + /// + /// This example shows how you can read HTML content from your XML document. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// # use std::borrow::Cow; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::reader::NsReader; + /// + /// let mut reader = NsReader::from_reader(" + /// + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// + /// ".as_bytes()); + /// reader.config_mut().trim_text(true); + /// + /// let start = BytesStart::new("html"); + /// let end = start.to_end().into_owned(); + /// + /// let mut buf = Vec::new(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); + /// // ...and disable checking of end names because we expect HTML further... + /// reader.config_mut().check_end_names = false; + /// + /// // ...then, we could read text content until close tag. + /// // This call will correctly handle nested elements. + /// let text = reader.read_text_into(end.name(), &mut buf).unwrap(); + /// let text = text.decode().unwrap(); + /// assert_eq!(text, r#" + /// This is a HTML text + ///

Usual XML rules does not apply inside it + ///

For example, elements not needed to be "closed" + /// "#); + /// assert!(matches!(text, Cow::Borrowed(_))); + /// + /// // Now we can enable checks again + /// reader.config_mut().check_end_names = true; + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`read_text()`]: Self::read_text() + #[inline] + pub fn read_text_into<'b>( + &mut self, + end: QName, + buf: &'b mut Vec, + ) -> Result> { + // According to the https://www.w3.org/TR/xml11/#dt-etag, end name should + // match literally the start name. See `Config::check_end_names` documentation + let result = self.reader.read_text_into(end, buf)?; + // read_text_into will consume closing tag. Because nobody can access its + // content anymore, we directly pop namespace of the opening tag + self.ns_resolver.pop(); + Ok(result) + } } impl NsReader> { diff --git a/tests/reader-namespaces.rs b/tests/reader-namespaces.rs index fe7f1e35..962bd200 100644 --- a/tests/reader-namespaces.rs +++ b/tests/reader-namespaces.rs @@ -1547,3 +1547,365 @@ mod read_text { assert_eq!(reader.read_event().unwrap(), Eof); } } + +mod read_text_into { + use super::*; + use pretty_assertions::assert_eq; + + /// Yes, this test contains invalid XML but since we can parse it, we check + /// that it does not break our parser + #[test] + fn decl() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Decl(BytesDecl::new("1.0", None, None)) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut
buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + /// Yes, this test contains invalid XML but since we can parse it, we check + /// that it does not break our parser + #[test] + fn doctype() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + DocType(BytesText::new("dtd")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn pi() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + PI(BytesPI::new("pi")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn comment() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut 
buf).unwrap(), + Comment(BytesText::new("comment")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn start() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + reader.config_mut().check_end_names = false; + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::new("tag")), + ) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + // NOTE: due to unbalanced XML namespace still not closed + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Empty(BytesStart::new("element")) + ) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn end() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + reader.config_mut().check_end_names = false; + reader.config_mut().allow_unmatched_ends = true; + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Bound(Namespace(b"namespace")), End(BytesEnd::new("tag")),) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, 
Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn empty() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Empty(BytesStart::new("tag")), + ) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn text() { + let mut reader = NsReader::from_str( + "\ + \ + text\ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Text(BytesText::new("text")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn cdata() { + let mut reader = NsReader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + CData(BytesCData::new("cdata")) + ); + assert_eq!( + 
reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } + + #[test] + fn general_ref() { + let mut reader = NsReader::from_str( + "\ + \ + &entity;\ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + ( + Bound(Namespace(b"namespace")), + Start(BytesStart::from_content("root xmlns='namespace'", 4)), + ) + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + GeneralRef(BytesRef::new("entity")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_resolved_event_into(&mut buf).unwrap(), + (Unbound, Empty(BytesStart::new("element"))) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Eof); + } +} diff --git a/tests/reader-read-text.rs b/tests/reader-read-text.rs index 05d8f1de..fe8bb082 100644 --- a/tests/reader-read-text.rs +++ b/tests/reader-read-text.rs @@ -311,3 +311,326 @@ mod borrowed { assert_eq!(reader.read_event().unwrap(), Event::Eof); } } + +mod buffered { + use super::*; + use pretty_assertions::assert_eq; + + /// Yes, this test contains invalid XML but since we can parse it, we check + /// that it does not break our parser + #[test] + fn decl() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Decl(BytesDecl::new("1.0", None, None)) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + 
Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + /// Yes, this test contains invalid XML but since we can parse it, we check + /// that it does not break our parser + #[test] + fn doctype() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::DocType(BytesText::new("dtd")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn pi() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::PI(BytesPI::new("pi")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn comment() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Comment(BytesText::new("comment")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + 
reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn start() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + reader.config_mut().check_end_names = false; + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::new("tag")), + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn end() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + reader.config_mut().check_end_names = false; + reader.config_mut().allow_unmatched_ends = true; + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::End(BytesEnd::new("tag")), + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn empty() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("tag")), + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), 
+ BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn text() { + let mut reader = Reader::from_str( + "\ + \ + text\ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Text(BytesText::new("text")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn cdata() { + let mut reader = Reader::from_str( + "\ + \ + \ + \ + \ + \ + ", + ); + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::CData(BytesCData::new("cdata")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } + + #[test] + fn dangling_amp() { + let mut reader = Reader::from_str( + "\ + \ + &\ + \ + \ + \ + ", + ); + reader.config_mut().allow_dangling_amp = true; + let mut buf = Vec::new(); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Start(BytesStart::from_content("root", 4)), + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!( + reader.read_text_into(QName(b"root"), &mut buf).unwrap(), + 
BytesText::from_escaped("") + ); + assert_eq!( + reader.read_event_into(&mut buf).unwrap(), + Event::Empty(BytesStart::new("element")) + ); + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } +}