Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

### New Features

- [#483]: Implement `read_text_into()` and `read_text_into_async()`.

### Bug Fixes

- [#939]: Fix parsing error of the tag from buffered reader, when the first byte `<`
Expand All @@ -24,6 +26,7 @@

### Misc Changes

[#483]: https://github.com/tafia/quick-xml/issues/483
[#936]: https://github.com/tafia/quick-xml/pull/936
[#939]: https://github.com/tafia/quick-xml/issues/939

Expand Down
10 changes: 3 additions & 7 deletions src/parser/dtd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl DtdParser {
/// # Parameters (the same as for `reader::BangType::parse`)
/// - `buf`: buffer with data consumed on previous iterations
/// - `chunk`: data read on current iteration and not yet consumed from reader
pub fn feed<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
pub fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
// This method assumes the DTD is well-formed.
// Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs
// is not particularly problematic; the only point of interest is reporting well-formed DTDs
Expand Down Expand Up @@ -102,9 +102,7 @@ impl DtdParser {
}
b'>' => {
*self = Self::Finished;
let len = chunk.len() - cur.len() + i;
// +1 for `>`
return Some((&chunk[..len], len + 1));
return Some(chunk.len() - cur.len() + i);
}
_ => {}
}
Expand Down Expand Up @@ -145,9 +143,7 @@ impl DtdParser {
Self::AfterInternalSubset => {
if let Some(i) = memchr::memchr(b'>', cur) {
*self = Self::Finished;
let len = chunk.len() - cur.len() + i;
// +1 for `>`
return Some((&chunk[..len], len + 1));
return Some(chunk.len() - cur.len() + i);
}
break;
}
Expand Down
153 changes: 151 additions & 2 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::task::{Context, Poll};
use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf};

use crate::errors::{Error, IllFormedError, Result, SyntaxError};
use crate::events::{BytesRef, Event};
use crate::events::{BytesRef, BytesText, Event};
use crate::name::{QName, ResolveResult};
use crate::parser::{ElementParser, Parser, PiParser};
use crate::reader::buffered_reader::impl_buffered_source;
Expand Down Expand Up @@ -180,7 +180,7 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
/// [`Start`]: Event::Start
pub async fn read_to_end_into_async<'n>(
&mut self,
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033`
// We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033
end: QName<'n>,
buf: &mut Vec<u8>,
) -> Result<Span> {
Expand All @@ -196,6 +196,82 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
))
}

/// An asynchronous version of [`read_text_into()`].
/// Reads asynchronously until the end element is found, using the provided buffer
/// as intermediate storage for the content of events. This function is supposed to
/// be called after you have already read a [`Start`] event.
///
/// See the documentation of [`read_text_into()`] for more information.
///
/// # Examples
///
/// This example shows how you can read HTML content from your XML document.
///
/// ```
/// # tokio_test::block_on(async {
/// # use pretty_assertions::assert_eq;
/// # use std::borrow::Cow;
/// use quick_xml::events::{BytesStart, Event};
/// use quick_xml::reader::Reader;
///
/// let mut reader = Reader::from_reader("
/// <html>
/// <title>This is a HTML text</title>
/// <p>Usual XML rules does not apply inside it
/// <p>For example, elements not needed to be &quot;closed&quot;
/// </html>
/// ".as_bytes());
/// reader.config_mut().trim_text(true);
///
/// let start = BytesStart::new("html");
/// let end = start.to_end().into_owned();
///
/// let mut buf = Vec::new();
///
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
/// // ...and disable checking of end names because we expect HTML further...
/// reader.config_mut().check_end_names = false;
///
/// // ...then, we could read text content until close tag.
/// // This call will correctly handle nested <html> elements.
/// let text = reader.read_text_into_async(end.name(), &mut buf).await.unwrap();
/// let text = text.decode().unwrap();
/// assert_eq!(text, r#"
/// <title>This is a HTML text</title>
/// <p>Usual XML rules does not apply inside it
/// <p>For example, elements not needed to be &quot;closed&quot;
/// "#);
/// assert!(matches!(text, Cow::Borrowed(_)));
///
/// // Now we can enable checks again
/// reader.config_mut().check_end_names = true;
///
/// // At the end we should get an Eof event, because we ate the whole XML
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Eof);
/// # }) // tokio_test::block_on
/// ```
///
/// [`read_text_into()`]: Self::read_text_into
/// [`Start`]: Event::Start
pub async fn read_text_into_async<'n, 'b>(
    &mut self,
    // The lifetime has to be named because of https://github.com/rust-lang/rust/issues/63033
    end: QName<'n>,
    buf: &'b mut Vec<u8>,
) -> Result<BytesText<'b>> {
    // `buf` may already hold data from previous reads, so remember where the
    // bytes appended by this call begin.
    let text_start = buf.len();
    let span = read_to_end!(self, end, buf, read_event_into_async, {}, await);

    // NOTE: the plain cast is fine here. `buf` cannot contain more than
    // isize::MAX bytes and it is not cleared while events are read, so the
    // length of the returned span fits into usize (otherwise appending to the
    // buffer would have panicked before this point).
    let text_len = (span.end - span.start) as usize;
    let text = &buf[text_start..text_start + text_len];

    Ok(BytesText::wrap(text, self.decoder()))
}

/// Private function to read until `>` is found. This function expects that
/// it was called just after encounter a `<` symbol.
async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
Expand Down Expand Up @@ -344,6 +420,79 @@ impl<R: AsyncBufRead + Unpin> NsReader<R> {
Ok(result)
}

/// An asynchronous version of [`read_text_into()`].
/// Reads asynchronously until the end element is found, using the provided buffer
/// as intermediate storage for the content of events. This function is supposed to
/// be called after you have already read a [`Start`] event.
///
/// See the documentation of [`read_text_into()`] for more information.
///
/// # Examples
///
/// This example shows how you can read HTML content from your XML document.
///
/// ```
/// # tokio_test::block_on(async {
/// # use pretty_assertions::assert_eq;
/// # use std::borrow::Cow;
/// use quick_xml::events::{BytesStart, Event};
/// use quick_xml::reader::NsReader;
///
/// let mut reader = NsReader::from_reader("
/// <html>
/// <title>This is a HTML text</title>
/// <p>Usual XML rules does not apply inside it
/// <p>For example, elements not needed to be &quot;closed&quot;
/// </html>
/// ".as_bytes());
/// reader.config_mut().trim_text(true);
///
/// let start = BytesStart::new("html");
/// let end = start.to_end().into_owned();
///
/// let mut buf = Vec::new();
///
/// // First, we read a start event...
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start));
/// // ...and disable checking of end names because we expect HTML further...
/// reader.config_mut().check_end_names = false;
///
/// // ...then, we could read text content until close tag.
/// // This call will correctly handle nested <html> elements.
/// let text = reader.read_text_into_async(end.name(), &mut buf).await.unwrap();
/// let text = text.decode().unwrap();
/// assert_eq!(text, r#"
/// <title>This is a HTML text</title>
/// <p>Usual XML rules does not apply inside it
/// <p>For example, elements not needed to be &quot;closed&quot;
/// "#);
/// assert!(matches!(text, Cow::Borrowed(_)));
///
/// // Now we can enable checks again
/// reader.config_mut().check_end_names = true;
///
/// // At the end we should get an Eof event, because we ate the whole XML
/// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Eof);
/// # }) // tokio_test::block_on
/// ```
///
/// [`read_text_into()`]: Self::read_text_into
/// [`Start`]: Event::Start
pub async fn read_text_into_async<'n, 'b>(
    &mut self,
    // The lifetime has to be named because of https://github.com/rust-lang/rust/issues/63033
    end: QName<'n>,
    buf: &'b mut Vec<u8>,
) -> Result<BytesText<'b>> {
    // Per https://www.w3.org/TR/xml11/#dt-etag the end name must literally match
    // the start name, so the name is handed to the inner reader as is.
    // See the `Config::check_end_names` documentation.
    let text = self.reader.read_text_into_async(end, buf).await?;

    // The inner call has consumed the closing tag, so its content can no longer
    // be accessed by anyone: pop the namespace of the opening tag right away.
    self.ns_resolver.pop();

    Ok(text)
}

/// An asynchronous version of [`read_resolved_event_into()`]. Reads the next
/// event into given buffer asynchronously and resolves its namespace (if applicable).
///
Expand Down
Loading