From a6d486ea5011acb8d8abd04bb0bd2fc0fcca1b82 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 7 Jul 2024 20:40:58 +0500 Subject: [PATCH 1/6] Add tests for `XmlSource::read_text` --- src/reader/mod.rs | 70 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index a05e5bc5..8e8e1342 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -913,13 +913,14 @@ impl Reader { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Result of an attempt to read XML textual data from the reader. +/// Result of an attempt to read XML textual data from the source. +#[derive(Debug)] enum ReadTextResult<'r, B> { - /// Start of markup (`<` character) was found in the first byte. + /// Start of markup (`<` character) was found in the first byte. `<` was consumed. /// Contains buffer that should be returned back to the next iteration cycle /// to satisfy borrow checker requirements. Markup(B), - /// Contains text block up to start of markup (`<` character). + /// Contains text block up to start of markup (`<` character). `<` was consumed. UpToMarkup(&'r [u8]), /// Contains text block up to EOF, start of markup (`<` character) was not found. UpToEof(&'r [u8]), @@ -1518,6 +1519,69 @@ mod test { } } + mod read_text { + use super::*; + use crate::reader::ReadTextResult; + use crate::utils::Bytes; + use pretty_assertions::assert_eq; + + #[$test] + $($async)? fn empty() { + let buf = $buf; + let mut position = 1; + let mut input = b"".as_ref(); + // ^= 1 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"")), + x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x), + } + assert_eq!(position, 1); + } + + #[$test] + $($async)? 
fn markup() { + let buf = $buf; + let mut position = 1; + let mut input = b"<".as_ref(); + // ^= 2 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::Markup(b) => assert_eq!(b, $buf), + x => panic!("Expected `Markup(_)`, but got `{:?}`", x), + } + assert_eq!(position, 2); + } + + #[$test] + $($async)? fn up_to_markup() { + let buf = $buf; + let mut position = 1; + let mut input = b"a<".as_ref(); + // 1 ^= 3 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")), + x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x), + } + assert_eq!(position, 3); + } + + #[$test] + $($async)? fn up_to_eof() { + let buf = $buf; + let mut position = 1; + let mut input = b"a".as_ref(); + // ^= 2 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")), + x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x), + } + assert_eq!(position, 2); + } + } + mod read_element { use super::*; use crate::errors::{Error, SyntaxError}; From dfea1106696a3d74412eae712876576f24c3da66 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 12 Jun 2024 00:45:52 +0500 Subject: [PATCH 2/6] Introduce a new `Event::GeneralRef` that is emitted on each `&...;` construction in a text failures (16): serde-de (9): borrow::escaped::element borrow::escaped::top_level resolve::resolve_custom_entity trivial::text::byte_buf trivial::text::bytes trivial::text::string::field trivial::text::string::naked trivial::text::string::text xml_schema_lists::element::text::string serde-migrated (1): test_parse_string serde-se (5): with_root::char_amp with_root::char_gt with_root::char_lt with_root::str_escaped with_root::tuple --doc (1): src\de\resolver.rs - de::resolver::EntityResolver (line 13) --- Changelog.md | 13 + fuzz/fuzz_targets/fuzz_target_1.rs | 5 + src/errors.rs | 6 + src/escape.rs | 2 +- 
src/events/mod.rs | 157 ++++++++- src/reader/async_tokio.rs | 8 +- src/reader/buffered_reader.rs | 101 +++++- src/reader/mod.rs | 215 +++++++++++- src/reader/slice_reader.rs | 66 +++- src/writer.rs | 1 + src/writer/async_tokio.rs | 1 + tests/async-tokio.rs | 15 +- tests/documents/html5.txt | 4 +- tests/html.rs | 11 +- tests/reader-errors.rs | 25 ++ tests/reader-references.rs | 546 +++++++++++++++++++++++++++++ tests/reader.rs | 10 +- 17 files changed, 1151 insertions(+), 35 deletions(-) create mode 100644 tests/reader-references.rs diff --git a/Changelog.md b/Changelog.md index e096aab2..7b6efd9f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -13,12 +13,25 @@ ## Unreleased +### Significant changes + +References to entities (both predefined, such as `&lt;`, and user-defined) are now reported as a new +`Event::GeneralRef`. +The caller can parse the content of the entity and stream events from it, as required by the +XML specification. + ### New Features +- [#766]: Allow parsing resolved entities as XML fragments and streaming events from them. +- [#766]: Added new event `Event::GeneralRef` with content of [general entity]. 
+ ### Bug Fixes ### Misc Changes +[#766]: https://github.com/tafia/quick-xml/pull/766 +[general entity]: https://www.w3.org/TR/xml11/#gen-entity + ## 0.37.0 -- 2024-10-27 diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index 08d983e9..d13e6081 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -55,6 +55,11 @@ where break; } } + Ok(Event::GeneralRef(ref e)) => { + debug_format!(e); + debug_format!(e.is_char_ref()); + debug_format!(e.resolve_char_ref()); + } Ok(Event::PI(ref e)) => { debug_format!(e); } diff --git a/src/errors.rs b/src/errors.rs index 5a15a5ad..f7c7a8c6 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -114,6 +114,9 @@ pub enum IllFormedError { /// [specification]: https://www.w3.org/TR/xml11/#sec-comments /// [configuration]: crate::reader::Config::check_comments DoubleHyphenInComment, + /// The parser started to parse entity or character reference (`&...;`) in text, + /// but the input ended before the closing `;` character was found. + UnclosedReference, } impl fmt::Display for IllFormedError { @@ -144,6 +147,9 @@ impl fmt::Display for IllFormedError { Self::DoubleHyphenInComment => { f.write_str("forbidden string `--` was found in a comment") } + Self::UnclosedReference => f.write_str( + "entity or character reference not closed: `;` not found before end of input", + ), } } } diff --git a/src/escape.rs b/src/escape.rs index 7175ed88..dd0f5f47 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1820,7 +1820,7 @@ pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> { Some(s) } -fn parse_number(num: &str) -> Result { +pub(crate) fn parse_number(num: &str) -> Result { let code = if let Some(hex) = num.strip_prefix('x') { from_str_radix(hex, 16)? 
} else { diff --git a/src/events/mod.rs b/src/events/mod.rs index 704c4ef6..c274085a 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -48,7 +48,8 @@ use std::str::from_utf8; use crate::encoding::{Decoder, EncodingError}; use crate::errors::{Error, IllFormedError}; use crate::escape::{ - escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with, + escape, minimal_escape, parse_number, partial_escape, resolve_predefined_entity, unescape_with, + EscapeError, }; use crate::name::{LocalName, QName}; #[cfg(feature = "serialize")] @@ -1291,6 +1292,154 @@ impl<'a> arbitrary::Arbitrary<'a> for BytesDecl<'a> { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Character or general entity reference (`Event::GeneralRef`): `&ref;` or `&#;`. +/// +/// This event implements `Deref`. The `deref()` implementation +/// returns the content of this event between `&` and `;`: +/// +/// ``` +/// # use quick_xml::events::{BytesRef, Event}; +/// # use quick_xml::reader::Reader; +/// # use pretty_assertions::assert_eq; +/// let mut reader = Reader::from_str(r#"&entity;"#); +/// let content = "entity"; +/// let event = BytesRef::new(content); +/// +/// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(event.borrow())); +/// // deref coercion of &BytesRef to &[u8] +/// assert_eq!(&event as &[u8], content.as_bytes()); +/// // AsRef<[u8]> for &T + deref coercion +/// assert_eq!(event.as_ref(), content.as_bytes()); +/// ``` +#[derive(Clone, Eq, PartialEq)] +pub struct BytesRef<'a> { + content: Cow<'a, [u8]>, + /// Encoding in which the `content` is stored inside the event. + decoder: Decoder, +} + +impl<'a> BytesRef<'a> { + /// Internal constructor, used by `Reader`. Supplies data in reader's encoding + #[inline] + pub(crate) const fn wrap(content: &'a [u8], decoder: Decoder) -> Self { + Self { + content: Cow::Borrowed(content), + decoder, + } + } + + /// Creates a new `BytesRef` borrowing a slice. 
+ /// + /// # Warning + /// + /// `name` must be a valid name. + #[inline] + pub fn new>>(name: C) -> Self { + Self { + content: str_cow_to_bytes(name), + decoder: Decoder::utf8(), + } + } + + /// Converts the event into an owned event. + pub fn into_owned(self) -> BytesRef<'static> { + BytesRef { + content: Cow::Owned(self.content.into_owned()), + decoder: self.decoder, + } + } + + /// Extracts the inner `Cow` from the `BytesRef` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content + } + + /// Converts the event into a borrowed event. + #[inline] + pub fn borrow(&self) -> BytesRef { + BytesRef { + content: Cow::Borrowed(&self.content), + decoder: self.decoder, + } + } + + /// Decodes the content of the event. + /// + /// This will allocate if the value contains any escape sequences or in + /// non-UTF-8 encoding. + pub fn decode(&self) -> Result, EncodingError> { + self.decoder.decode_cow(&self.content) + } + + /// Returns `true` if the specified reference represents the character reference + /// (`&#;`). + /// + /// ``` + /// # use quick_xml::events::BytesRef; + /// # use pretty_assertions::assert_eq; + /// assert_eq!(BytesRef::new("#x30").is_char_ref(), true); + /// assert_eq!(BytesRef::new("#49" ).is_char_ref(), true); + /// assert_eq!(BytesRef::new("lt" ).is_char_ref(), false); + /// ``` + pub fn is_char_ref(&self) -> bool { + matches!(self.content.first(), Some(b'#')) + } + + /// If this reference represents character reference, then resolves it and + /// returns the character, otherwise returns `None`. + /// + /// This method does not check if character is allowed for XML, in other words, + /// well-formedness constraint [WFC: Legal Char] is not enforced. + /// The character `0x0`, however, will return `EscapeError::InvalidCharRef`. 
+ /// + /// ``` + /// # use quick_xml::events::BytesRef; + /// # use pretty_assertions::assert_eq; + /// assert_eq!(BytesRef::new("#x30").resolve_char_ref().unwrap(), Some('0')); + /// assert_eq!(BytesRef::new("#49" ).resolve_char_ref().unwrap(), Some('1')); + /// assert_eq!(BytesRef::new("lt" ).resolve_char_ref().unwrap(), None); + /// ``` + /// + /// [WFC: Legal Char]: https://www.w3.org/TR/xml11/#wf-Legalchar + pub fn resolve_char_ref(&self) -> Result, Error> { + if let Some(num) = self.decode()?.strip_prefix('#') { + let ch = parse_number(num).map_err(EscapeError::InvalidCharRef)?; + return Ok(Some(ch)); + } + Ok(None) + } +} + +impl<'a> Debug for BytesRef<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "BytesRef {{ content: ")?; + write_cow_string(f, &self.content)?; + write!(f, " }}") + } +} + +impl<'a> Deref for BytesRef<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &self.content + } +} + +#[cfg(feature = "arbitrary")] +impl<'a> arbitrary::Arbitrary<'a> for BytesRef<'a> { + fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { + Ok(Self::new(<&str>::arbitrary(u)?)) + } + + fn size_hint(depth: usize) -> (usize, Option) { + <&str as arbitrary::Arbitrary>::size_hint(depth) + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Event emitted by [`Reader::read_event_into`]. /// /// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into @@ -1315,6 +1464,9 @@ pub enum Event<'a> { PI(BytesPI<'a>), /// Document type definition data (DTD) stored in ``. DocType(BytesText<'a>), + /// General reference `&entity;` in the textual data. Can be either an entity + /// reference, or a character reference. + GeneralRef(BytesRef<'a>), /// End of XML document. 
Eof, } @@ -1333,6 +1485,7 @@ impl<'a> Event<'a> { Event::Decl(e) => Event::Decl(e.into_owned()), Event::PI(e) => Event::PI(e.into_owned()), Event::DocType(e) => Event::DocType(e.into_owned()), + Event::GeneralRef(e) => Event::GeneralRef(e.into_owned()), Event::Eof => Event::Eof, } } @@ -1350,6 +1503,7 @@ impl<'a> Event<'a> { Event::Decl(e) => Event::Decl(e.borrow()), Event::PI(e) => Event::PI(e.borrow()), Event::DocType(e) => Event::DocType(e.borrow()), + Event::GeneralRef(e) => Event::GeneralRef(e.borrow()), Event::Eof => Event::Eof, } } @@ -1368,6 +1522,7 @@ impl<'a> Deref for Event<'a> { Event::CData(ref e) => e, Event::Comment(ref e) => e, Event::DocType(ref e) => e, + Event::GeneralRef(ref e) => e, Event::Eof => &[], } } diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index ac74e232..a9237de0 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -7,12 +7,14 @@ use std::task::{Context, Poll}; use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf}; -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, IllFormedError, Result, SyntaxError}; +use crate::events::{BytesRef, Event}; use crate::name::{QName, ResolveResult}; use crate::parser::{ElementParser, Parser, PiParser}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{BangType, BinaryStream, NsReader, ParseState, ReadTextResult, Reader, Span}; +use crate::reader::{ + BangType, BinaryStream, NsReader, ParseState, ReadRefResult, ReadTextResult, Reader, Span, +}; use crate::utils::is_whitespace; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. 
diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 0136a55e..f268448c 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -9,7 +9,7 @@ use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; use crate::parser::Parser; -use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource}; +use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource}; use crate::utils::is_whitespace; macro_rules! impl_buffered_source { @@ -69,17 +69,22 @@ macro_rules! impl_buffered_source { } }; - match memchr::memchr(b'<', available) { + // Search for start of markup or an entity or character reference + match memchr::memchr2(b'<', b'&', available) { // Special handling is needed only on the first iteration. // On next iterations we already read something and should emit Text event - Some(0) if read == 0 => { + Some(0) if read == 0 && available[0] == b'<' => { self $(.$reader)? .consume(1); *position += 1; return ReadTextResult::Markup(buf); } - Some(i) => { + // Do not consume `&` because it may be lone and we would be need to + // return it as part of Text event + Some(0) if read == 0 => return ReadTextResult::Ref(buf), + Some(i) if available[i] == b'<' => { buf.extend_from_slice(&available[..i]); + // +1 to skip `<` let used = i + 1; self $(.$reader)? .consume(used); read += used as u64; @@ -87,6 +92,15 @@ macro_rules! impl_buffered_source { *position += read; return ReadTextResult::UpToMarkup(&buf[start..]); } + Some(i) => { + buf.extend_from_slice(&available[..i]); + + self $(.$reader)? .consume(i); + read += i as u64; + + *position += read; + return ReadTextResult::UpToRef(&buf[start..]); + } None => { buf.extend_from_slice(available); @@ -101,6 +115,85 @@ macro_rules! impl_buffered_source { ReadTextResult::UpToEof(&buf[start..]) } + #[inline] + $($async)? fn read_ref $(<$lf>)? 
( + &mut self, + buf: &'b mut Vec, + position: &mut u64, + ) -> ReadRefResult<'b> { + let mut read = 0; + let start = buf.len(); + loop { + let available = match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return ReadRefResult::Err(e); + } + }; + // `read_ref` called when the first character is `&`, so we + // should explicitly skip it at first iteration lest we confuse + // it with the end + if read == 0 { + debug_assert_eq!( + available.first(), + Some(&b'&'), + "`read_ref` must be called at `&`" + ); + // If that ampersand is lone, then it will be part of text + // and we should keep it + buf.push(b'&'); + self $(.$reader)? .consume(1); + read += 1; + continue; + } + + match memchr::memchr3(b';', b'&', b'<', available) { + // Do not consume `&` because it may be lone and we would be need to + // return it as part of Text event + Some(i) if available[i] == b'&' => { + buf.extend_from_slice(&available[..i]); + + self $(.$reader)? .consume(i); + read += i as u64; + + *position += read; + + return ReadRefResult::UpToRef; + } + Some(i) => { + let is_end = available[i] == b';'; + buf.extend_from_slice(&available[..i]); + + // +1 -- skip the end `;` or `<` + let used = i + 1; + self $(.$reader)? .consume(used); + read += used as u64; + + *position += read; + + return if is_end { + ReadRefResult::Ref(&buf[start..]) + } else { + ReadRefResult::UpToMarkup + }; + } + None => { + buf.extend_from_slice(available); + + let used = available.len(); + self $(.$reader)? .consume(used); + read += used as u64; + } + } + } + + *position += read; + ReadRefResult::UpToEof + } + #[inline] $($async)? fn read_with<$($lf,)? 
P: Parser>( &mut self, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 8e8e1342..cf806e3e 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -6,8 +6,8 @@ use std::io; use std::ops::Range; use crate::encoding::Decoder; -use crate::errors::{Error, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, IllFormedError, SyntaxError}; +use crate::events::{BytesRef, Event}; use crate::parser::{ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; @@ -232,7 +232,7 @@ macro_rules! read_event_impl { ) => {{ let event = loop { break match $self.state.state { - ParseState::Init => { // Go to InsideMarkup state + ParseState::Init => { // Go to InsideText state // If encoding set explicitly, we not need to detect it. For example, // explicit UTF-8 set automatically if Reader was created using `from_str`. // But we still need to remove BOM for consistency with no encoding @@ -251,6 +251,35 @@ macro_rules! read_event_impl { $self.state.state = ParseState::InsideText; continue; }, + ParseState::InsideRef => { // Go to InsideText + let start = $self.state.offset; + match $reader.read_ref($buf, &mut $self.state.offset) $(.$await)? 
{ + // Emit reference, go to InsideText state + ReadRefResult::Ref(bytes) => { + $self.state.state = ParseState::InsideText; + // +1 to skip start `&` + Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder()))) + } + // Go to Done state + ReadRefResult::UpToEof => { + $self.state.state = ParseState::Done; + $self.state.last_error_offset = start; + Err(Error::IllFormed(IllFormedError::UnclosedReference)) + } + // Do not change state, stay in InsideRef + ReadRefResult::UpToRef => { + $self.state.last_error_offset = start; + Err(Error::IllFormed(IllFormedError::UnclosedReference)) + } + // Go to InsideMarkup state + ReadRefResult::UpToMarkup => { + $self.state.state = ParseState::InsideMarkup; + $self.state.last_error_offset = start; + Err(Error::IllFormed(IllFormedError::UnclosedReference)) + } + ReadRefResult::Err(e) => Err(Error::Io(e.into())), + } + } ParseState::InsideText => { // Go to InsideMarkup or Done state if $self.state.config.trim_text_start { $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?; @@ -263,6 +292,12 @@ macro_rules! read_event_impl { $buf = buf; continue; } + ReadTextResult::Ref(buf) => { + $self.state.state = ParseState::InsideRef; + // Pass `buf` to the next iteration of parsing loop + $buf = buf; + continue; + } ReadTextResult::UpToMarkup(bytes) => { $self.state.state = ParseState::InsideMarkup; // FIXME: Can produce an empty event if: @@ -271,6 +306,11 @@ macro_rules! 
read_event_impl { // - trim_text_end = true Ok(Event::Text($self.state.emit_text(bytes))) } + ReadTextResult::UpToRef(bytes) => { + $self.state.state = ParseState::InsideRef; + // Return Text event with `bytes` content + Ok(Event::Text($self.state.emit_text(bytes))) + } ReadTextResult::UpToEof(bytes) => { $self.state.state = ParseState::Done; // Trim bytes from end if required @@ -484,6 +524,7 @@ pub type Span = Range; /// Init -- "(no event)"\n --> InsideMarkup /// InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText /// InsideText -- "#lt;false#gt;\n(no event)"\nText --> InsideMarkup +/// InsideRef -- "(no event)"\nGeneralRef --> InsideText /// end /// InsideText -- "#lt;true#gt;"\nStart --> InsideEmpty /// InsideEmpty -- End --> InsideText @@ -497,6 +538,11 @@ enum ParseState { /// event emitted during transition to `InsideMarkup` is a `StartEvent` if the /// first symbol not `<`, otherwise no event are emitted. Init, + /// State after seeing the `&` symbol in textual content. The reference is read up to + /// the closing `;` and emitted as a `GeneralRef` event. + /// + /// After emitting the reference the reader moves to the `InsideText` state. + InsideRef, /// State after seeing the `<` symbol. Depending on the next symbol all other /// events could be generated. /// @@ -788,7 +834,12 @@ impl Reader { &mut self.reader } - /// Gets the current byte position in the input data. + /// Gets the byte position in the input data just after the last emitted event + /// (i.e. this is the position where the data of the last event ends). + /// + /// Note that for text events that originally ended with whitespace characters + /// (` `, `\t`, `\r`, and `\n`), if [`Config::trim_text_end`] is set, this is the position + /// before trimming, not the position of the last byte of the [`Event::Text`] content. 
pub const fn buffer_position(&self) -> u64 { // when internal state is InsideMarkup, we have actually read until '<', // which we don't want to show @@ -920,14 +971,43 @@ enum ReadTextResult<'r, B> { /// Contains buffer that should be returned back to the next iteration cycle /// to satisfy borrow checker requirements. Markup(B), + /// Start of reference (`&` character) was found in the first byte. + /// `&` was not consumed. + /// Contains buffer that should be returned back to the next iteration cycle + /// to satisfy borrow checker requirements. + Ref(B), /// Contains text block up to start of markup (`<` character). `<` was consumed. UpToMarkup(&'r [u8]), - /// Contains text block up to EOF, start of markup (`<` character) was not found. + /// Contains text block up to start of reference (`&` character). + /// `&` was not consumed. + UpToRef(&'r [u8]), + /// Contains text block up to EOF, neither start of markup (`<` character) + /// or start of reference (`&` character) was found. UpToEof(&'r [u8]), /// IO error occurred. Err(io::Error), } +/// Result of an attempt to read general reference from the reader. +#[derive(Debug)] +enum ReadRefResult<'r> { + /// Contains text block up to end of reference (`;` character). + /// Result includes start `&`, but not end `;`. + Ref(&'r [u8]), + /// Contains text block up to EOF. Neither end of reference (`;`), start of + /// another reference (`&`) or start of markup (`<`) characters was found. + /// Result includes start `&`. + UpToEof, + /// Contains text block up to next possible reference (`&` character). + /// Result includes start `&`. + UpToRef, + /// Contains text block up to start of markup (`<` character). + /// Result includes start `&`. + UpToMarkup, + /// IO error occurred. + Err(io::Error), +} + /// Represents an input for a reader that can return borrowed data. 
/// /// There are two implementors of this trait: generic one that read data from @@ -951,7 +1031,8 @@ trait XmlSource<'r, B> { #[cfg(feature = "encoding")] fn detect_encoding(&mut self) -> io::Result>; - /// Read input until start of markup (the `<`) is found or end of input is reached. + /// Read input until start of markup (the `<`) is found, start of general entity + /// reference (the `&`) is found or end of input is reached. /// /// # Parameters /// - `buf`: Buffer that could be filled from an input (`Self`) and @@ -961,6 +1042,19 @@ trait XmlSource<'r, B> { /// [events]: crate::events::Event fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>; + /// Read input until end of general reference (the `;`) is found, start of + /// another general reference (the `&`) is found or end of input is reached. + /// + /// This method must be called when current character is `&`. + /// + /// # Parameters + /// - `buf`: Buffer that could be filled from an input (`Self`) and + /// from which [events] could borrow their data + /// - `position`: Will be increased by amount of bytes consumed + /// + /// [events]: crate::events::Event + fn read_ref(&mut self, buf: B, position: &mut u64) -> ReadRefResult<'r>; + /// Read input until processing instruction is finished. /// /// This method expect that start sequence of a parser already was read. @@ -1553,6 +1647,20 @@ mod test { assert_eq!(position, 2); } + #[$test] + $($async)? fn ref_() { + let buf = $buf; + let mut position = 1; + let mut input = b"&".as_ref(); + // ^= 1 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::Ref(b) => assert_eq!(b, $buf), + x => panic!("Expected `Ref(_)`, but got `{:?}`", x), + } + assert_eq!(position, 1); + } + #[$test] $($async)? fn up_to_markup() { let buf = $buf; @@ -1567,6 +1675,20 @@ mod test { assert_eq!(position, 3); } + #[$test] + $($async)? 
fn up_to_ref() { + let buf = $buf; + let mut position = 1; + let mut input = b"a&".as_ref(); + // ^= 2 + + match $source(&mut input).read_text(buf, &mut position) $(.$await)? { + ReadTextResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"a")), + x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x), + } + assert_eq!(position, 2); + } + #[$test] $($async)? fn up_to_eof() { let buf = $buf; @@ -1582,6 +1704,87 @@ mod test { } } + mod read_ref { + use super::*; + use crate::reader::ReadRefResult; + use crate::utils::Bytes; + use pretty_assertions::assert_eq; + + // Empty input is not allowed for `read_ref` so not tested. + // Borrowed source triggers debug assertion, + // buffered do nothing due to implementation details. + + #[$test] + $($async)? fn up_to_eof() { + let buf = $buf; + let mut position = 1; + let mut input = b"&".as_ref(); + // ^= 2 + + match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { + ReadRefResult::UpToEof => (), + x => panic!("Expected `UpToEof`, but got `{:?}`", x), + } + assert_eq!(position, 2); + } + + #[$test] + $($async)? fn up_to_ref() { + let buf = $buf; + let mut position = 1; + let mut input = b"&&".as_ref(); + // ^= 2 + + match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { + ReadRefResult::UpToRef => (), + x => panic!("Expected `UpToRef`, but got `{:?}`", x), + } + assert_eq!(position, 2); + } + + #[$test] + $($async)? fn up_to_markup() { + let buf = $buf; + let mut position = 1; + let mut input = b"&<".as_ref(); + // ^= 3 + + match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { + ReadRefResult::UpToMarkup => (), + x => panic!("Expected `UpToMarkup`, but got `{:?}`", x), + } + assert_eq!(position, 3); + } + + #[$test] + $($async)? fn empty_ref() { + let buf = $buf; + let mut position = 1; + let mut input = b"&;".as_ref(); + // ^= 3 + + match $source(&mut input).read_ref(buf, &mut position) $(.$await)? 
{ + ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `Ref(_)`, but got `{:?}`", x), + } + assert_eq!(position, 3); + } + + #[$test] + $($async)? fn normal() { + let buf = $buf; + let mut position = 1; + let mut input = b"&lt;".as_ref(); + // ^= 5 + + match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { + ReadRefResult::Ref(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&lt")), + x => panic!("Expected `Ref(_)`, but got `{:?}`", x), + } + assert_eq!(position, 5); + } + } + mod read_element { use super::*; use crate::errors::{Error, SyntaxError}; diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 08287592..37439597 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -14,7 +14,7 @@ use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; use crate::parser::Parser; -use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource}; +use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource}; use crate::utils::is_whitespace; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. 
@@ -263,27 +263,79 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { #[inline] fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> { - match memchr::memchr(b'<', self) { - Some(0) => { - *position += 1; + // Search for start of markup or an entity or character reference + match memchr::memchr2(b'<', b'&', self) { + Some(0) if self[0] == b'<' => { *self = &self[1..]; + *position += 1; ReadTextResult::Markup(()) } - Some(i) => { - *position += i as u64 + 1; + // Do not consume `&` because it may be lone and we would be need to + // return it as part of Text event + Some(0) => ReadTextResult::Ref(()), + Some(i) if self[i] == b'<' => { let bytes = &self[..i]; *self = &self[i + 1..]; + *position += i as u64 + 1; ReadTextResult::UpToMarkup(bytes) } + Some(i) => { + let (bytes, rest) = self.split_at(i); + *self = rest; + *position += i as u64; + ReadTextResult::UpToRef(bytes) + } None => { - *position += self.len() as u64; let bytes = &self[..]; *self = &[]; + *position += bytes.len() as u64; ReadTextResult::UpToEof(bytes) } } } + #[inline] + fn read_ref(&mut self, _buf: (), position: &mut u64) -> ReadRefResult<'a> { + debug_assert_eq!( + self.first(), + Some(&b'&'), + "`read_ref` must be called at `&`" + ); + // Search for the end of reference or a start of another reference or a markup + match memchr::memchr3(b';', b'&', b'<', &self[1..]) { + // Do not consume `&` because it may be lone and we would be need to + // return it as part of Text event + Some(i) if self[i + 1] == b'&' => { + let (_, rest) = self.split_at(i + 1); + *self = rest; + *position += i as u64 + 1; + + ReadRefResult::UpToRef + } + Some(i) => { + let end = i + 1; + let is_end = self[end] == b';'; + let bytes = &self[..end]; + // +1 -- skip the end `;` or `<` + *self = &self[end + 1..]; + *position += end as u64 + 1; + + if is_end { + ReadRefResult::Ref(bytes) + } else { + ReadRefResult::UpToMarkup + } + } + None => { + let bytes = &self[..]; + *self = &[]; + *position += 
bytes.len() as u64; + + ReadRefResult::UpToEof + } + } + } + #[inline] fn read_with

(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]> where diff --git a/src/writer.rs b/src/writer.rs index 19d120bf..f0a6a97d 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -221,6 +221,7 @@ impl Writer { Event::Decl(e) => self.write_wrapped(b""), Event::PI(e) => self.write_wrapped(b""), Event::DocType(e) => self.write_wrapped(b""), + Event::GeneralRef(e) => self.write_wrapped(b"&", &e, b";"), Event::Eof => Ok(()), }; if let Some(i) = self.indent.as_mut() { diff --git a/src/writer/async_tokio.rs b/src/writer/async_tokio.rs index dab4c5b2..4f1c79a6 100644 --- a/src/writer/async_tokio.rs +++ b/src/writer/async_tokio.rs @@ -40,6 +40,7 @@ impl Writer { Event::Decl(e) => self.write_wrapped_async(b"").await, Event::PI(e) => self.write_wrapped_async(b"").await, Event::DocType(e) => self.write_wrapped_async(b"").await, + Event::GeneralRef(e) => self.write_wrapped_async(b"&", &e, b";").await, Event::Eof => Ok(()), }; if let Some(i) = self.indent.as_mut() { diff --git a/tests/async-tokio.rs b/tests/async-tokio.rs index 25ec86bc..94003c0c 100644 --- a/tests/async-tokio.rs +++ b/tests/async-tokio.rs @@ -29,18 +29,19 @@ async fn test_sample() { loop { reads += 1; assert!( - reads <= 5245, + reads <= 10000, "too many events, possible infinity loop: {reads}" ); - match reader.read_event_into_async(&mut buf).await.unwrap() { - Start(_) => count += 1, - Decl(e) => assert_eq!(e.version().unwrap(), b"1.0".as_ref()), - Eof => break, - _ => (), + match reader.read_event_into_async(&mut buf).await { + Ok(Start(_)) => count += 1, + Ok(Decl(e)) => assert_eq!(e.version().unwrap(), b"1.0".as_ref()), + Ok(Eof) => break, + Ok(_) => (), + Err(e) => panic!("{} at {}", e, reader.error_position()), } buf.clear(); } - assert_eq!((count, reads), (1247, 5245)); + assert_eq!((count, reads), (1247, 5457)); } /// This tests checks that read_to_end() correctly returns span even when diff --git a/tests/documents/html5.txt b/tests/documents/html5.txt index 05f200d4..de0a5b43 
100644 --- a/tests/documents/html5.txt +++ b/tests/documents/html5.txt @@ -5,6 +5,8 @@ StartElement(a, attr-error: position 7: attribute value must be enclosed in `"` Characters(Hey) EndElement(a) Characters( -  +) +Reference(nbsp) +Characters( ) EndDocument diff --git a/tests/html.rs b/tests/html.rs index 19688064..b93c788a 100644 --- a/tests/html.rs +++ b/tests/html.rs @@ -21,7 +21,12 @@ fn escaped_characters_html() { r#"╔╗╔╗╔╗"#, r#" |StartElement(e [attr="ℏÈℓ𝕝⨀"]) - |Characters(╔╗╔╗╔╗) + |Reference(boxDR) + |Reference(boxDL) + |Reference(#x02554) + |Reference(#x02557) + |Reference(#9556) + |Reference(#9559) |EndElement(e) |EndDocument "#, @@ -86,6 +91,10 @@ fn test_bytes(input: &[u8], output: &[u8], trim: bool) { Ok(c) => format!("Characters({})", &c), Err(err) => format!("FailedUnescape({:?}; {})", e.as_ref(), err), }, + Ok((_, Event::GeneralRef(e))) => match unescape(&decoder.decode(&e).unwrap()) { + Ok(c) => format!("Reference({})", &c), + Err(err) => format!("FailedUnescape({:?}; {})", e.as_ref(), err), + }, Ok((_, Event::Eof)) => "EndDocument".to_string(), Err(e) => format!("Error: {}", e), }; diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index 8f9c578e..0eecca7d 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -896,4 +896,29 @@ mod ill_formed { // ^= 5 err!(double_hyphen_in_comment4("") => 5: IllFormedError::DoubleHyphenInComment); // ^= 5 + + mod reference { + use super::*; + use quick_xml::events::BytesRef; + + err2!(unclosed1(".&") => 1: IllFormedError::UnclosedReference); + err2!(unclosed2(".&x") => 1: IllFormedError::UnclosedReference); + err2!(unclosed_num(".&#") => 1: IllFormedError::UnclosedReference); + err2!(unclosed_dec(".") => 1: IllFormedError::UnclosedReference); + err2!(unclosed_hex1(".&#x") => 1: IllFormedError::UnclosedReference); + err2!(unclosed_hex2(".") => 1: IllFormedError::UnclosedReference); + + // We do not check correctness of references during parsing + ok!(empty("&;") => 2: 
Event::GeneralRef(BytesRef::new(""))); + ok!(normal1("&x;") => 3: Event::GeneralRef(BytesRef::new("x"))); + ok!(normal2("&x;rest") => 3: Event::GeneralRef(BytesRef::new("x"))); + ok!(num("&#;") => 3: Event::GeneralRef(BytesRef::new("#"))); + ok!(dec("") => 4: Event::GeneralRef(BytesRef::new("#2"))); + ok!(hex1("&#x;") => 4: Event::GeneralRef(BytesRef::new("#x"))); + ok!(hex2("") => 5: Event::GeneralRef(BytesRef::new("#xF"))); + + // XML specification explicitly allowed any number of leading zeroes + ok!(long_dec(" ") => 44: Event::GeneralRef(BytesRef::new("#00000000000000000000000000000000000000032"))); + ok!(long_hex(" ") => 45: Event::GeneralRef(BytesRef::new("#x00000000000000000000000000000000000000020"))); + } } diff --git a/tests/reader-references.rs b/tests/reader-references.rs new file mode 100644 index 00000000..b0f3456e --- /dev/null +++ b/tests/reader-references.rs @@ -0,0 +1,546 @@ +use quick_xml::events::{ + BytesCData, BytesDecl, BytesEnd, BytesPI, BytesRef, BytesStart, BytesText, Event::*, +}; +use quick_xml::reader::Reader; + +use pretty_assertions::assert_eq; + +mod character_reference { + use super::*; + + mod dec { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn decl() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + Decl(BytesDecl::new(&format!("&{i};"), None, None)), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn pi() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + PI(BytesPI::new(&format!("&{i};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn doctype() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + DocType(BytesText::from_escaped(&format!("&{i};"))), + "Character 
reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn comment() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + Comment(BytesText::from_escaped(&format!("&{i};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn cdata() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + CData(BytesCData::new(format!("&{i};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn text() { + for i in 0..=0x10FFFF { + let input = format!("&{i};"); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + GeneralRef(BytesRef::new(format!("{i}"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn empty() { + for i in 0u32..=0x10FFFF { + let input = format!("<&{i}; &{i};='&{i};' &{i};=\"&{i};\" &{i};=&{i};/>"); + let mut reader = Reader::from_str(&input); + + let name_len = format!("&{i};").len(); + assert_eq!( + reader.read_event().unwrap(), + Empty(BytesStart::from_content( + format!("&{i}; &{i};='&{i};' &{i};=\"&{i};\" &{i};=&{i};"), + name_len + )), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn start() { + for i in 0..=0x10FFFF { + let input = format!("<&{i}; &{i};='&{i};' &{i};=\"&{i};\" &{i};=&{i};>"); + let mut reader = Reader::from_str(&input); + + let name_len = format!("&{i};").len(); + assert_eq!( + reader.read_event().unwrap(), + Start(BytesStart::from_content( + format!("&{i}; &{i};='&{i};' &{i};=\"&{i};\" &{i};=&{i};"), + name_len + )), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn end() { + for i in 0..=0x10FFFF { + let input = format!("<>"); + let mut reader = Reader::from_str(&input); + reader.config_mut().check_end_names = false; + + // Skip <> + reader.read_event().unwrap(); + assert_eq!( + 
reader.read_event().unwrap(), + End(BytesEnd::new(format!("&{i};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + } + + mod hex { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn decl() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + Decl(BytesDecl::new(&format!("&#{i:x};"), None, None)), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn pi() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + PI(BytesPI::new(&format!("&#{i:x};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn doctype() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + DocType(BytesText::from_escaped(&format!("&#{i:x};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn comment() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + Comment(BytesText::from_escaped(&format!("&#{i:x};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn cdata() { + for i in 0..=0x10FFFF { + let input = format!(""); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + CData(BytesCData::new(format!("&#{i:x};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn text() { + for i in 0..=0x10FFFF { + let input = format!("&#{i:x};"); + let mut reader = Reader::from_str(&input); + + assert_eq!( + reader.read_event().unwrap(), + GeneralRef(BytesRef::new(format!("#{i:x}"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn empty() { + for i in 0u32..=0x10FFFF { + let input = format!( + "<&#{i:x}; 
&#{i:x};='&#{i:x};' &#{i:x};=\"&#{i:x};\" &#{i:x};=&#{i:x};/>" + ); + let mut reader = Reader::from_str(&input); + + let name_len = format!("&#{i:x};").len(); + assert_eq!( + reader.read_event().unwrap(), + Empty(BytesStart::from_content( + format!( + "&#{i:x}; &#{i:x};='&#{i:x};' &#{i:x};=\"&#{i:x};\" &#{i:x};=&#{i:x};" + ), + name_len + )), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn start() { + for i in 0..=0x10FFFF { + let input = format!( + "<&#{i:x}; &#{i:x};='&#{i:x};' &#{i:x};=\"&#{i:x};\" &#{i:x};=&#{i:x};>" + ); + let mut reader = Reader::from_str(&input); + + let name_len = format!("&#{i:x};").len(); + assert_eq!( + reader.read_event().unwrap(), + Start(BytesStart::from_content( + format!( + "&#{i:x}; &#{i:x};='&#{i:x};' &#{i:x};=\"&#{i:x};\" &#{i:x};=&#{i:x};" + ), + name_len + )), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + + #[test] + fn end() { + for i in 0..=0x10FFFF { + let input = format!("<>"); + let mut reader = Reader::from_str(&input); + reader.config_mut().check_end_names = false; + + // Skip <> + reader.read_event().unwrap(); + assert_eq!( + reader.read_event().unwrap(), + End(BytesEnd::new(format!("&#{i:x};"))), + "Character reference {i}=0x{i:x}: {input}" + ); + } + } + } +} + +mod general_entity_reference { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn decl() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + Decl(BytesDecl::new("&entity;", None, None)), + ); + } + + #[test] + fn pi() { + let mut reader = Reader::from_str(""); + + assert_eq!(reader.read_event().unwrap(), PI(BytesPI::new("&entity;"))); + } + + #[test] + fn doctype() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + DocType(BytesText::from_escaped("&entity;")), + ); + } + + #[test] + fn comment() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + 
Comment(BytesText::from_escaped("&entity;")), + ); + } + + #[test] + fn cdata() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + CData(BytesCData::new("&entity;")), + ); + } + + #[test] + fn text() { + let mut reader = Reader::from_str("&entity;"); + + assert_eq!( + reader.read_event().unwrap(), + GeneralRef(BytesRef::new("entity")), + ); + } + + #[test] + fn empty() { + let mut reader = Reader::from_str( + "<&entity; &entity;='&entity;' &entity;=\"&entity;\" &entity;=&entity;/>", + ); + + let name_len = "&entity;".len(); + assert_eq!( + reader.read_event().unwrap(), + Empty(BytesStart::from_content( + "&entity; &entity;='&entity;' &entity;=\"&entity;\" &entity;=&entity;", + name_len + )), + ); + } + + #[test] + fn start() { + let mut reader = Reader::from_str( + "<&entity; &entity;='&entity;' &entity;=\"&entity;\" &entity;=&entity;>", + ); + + let name_len = "&entity;".len(); + assert_eq!( + reader.read_event().unwrap(), + Start(BytesStart::from_content( + "&entity; &entity;='&entity;' &entity;=\"&entity;\" &entity;=&entity;", + name_len + )), + ); + } + + #[test] + fn end() { + let mut reader = Reader::from_str("<>"); + reader.config_mut().check_end_names = false; + + // Skip <> + reader.read_event().unwrap(); + assert_eq!(reader.read_event().unwrap(), End(BytesEnd::new("&entity;"))); + } +} + +/// _Parameter entity references_ are references to entities recognized within DTD. +/// Those references are recognized [only] inside DTD (`<!DOCTYPE>` declaration) and have a +/// form `%name;` (percent sign, name, semicolon). +/// +/// Parameter entities are so-called _parsed entities_, i.e. the content of this +/// reference is a part of DTD and MUST follow DTD grammar after all substitutions. +/// That also means that DTD could be self-modified. +/// +/// In those tests, however, parameter entity references are not recognized. 
+/// +/// [only]: https://www.w3.org/TR/xml11/#indtd +mod parameter_entity_reference { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn decl() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + Decl(BytesDecl::new("%param;", None, None)), + ); + } + + #[test] + fn pi() { + let mut reader = Reader::from_str(""); + + assert_eq!(reader.read_event().unwrap(), PI(BytesPI::new("%param;"))); + } + + /// Because we do not parse DTD, we do not recognize parameter entity references here yet. + /// TODO: Recognize parameter entity references when DTD parsing is implemented + #[test] + fn doctype() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + DocType(BytesText::from_escaped("%param;")), + ); + } + + /// Comments can be part of DTD, but parameter entity references are not recognized within them. + /// + /// See: + #[test] + fn comment() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + Comment(BytesText::from_escaped("%param;")), + ); + } + + #[test] + fn cdata() { + let mut reader = Reader::from_str(""); + + assert_eq!( + reader.read_event().unwrap(), + CData(BytesCData::new("%param;")), + ); + } + + #[test] + fn text() { + let mut reader = Reader::from_str("%param;"); + + assert_eq!( + reader.read_event().unwrap(), + Text(BytesText::from_escaped("%param;")), + ); + } + + #[test] + fn empty() { + let mut reader = + Reader::from_str("<%param; %param;='%param;' %param;=\"%param;\" %param;=%param;/>"); + + let name_len = "%param;".len(); + assert_eq!( + reader.read_event().unwrap(), + Empty(BytesStart::from_content( + "%param; %param;='%param;' %param;=\"%param;\" %param;=%param;", + name_len + )), + ); + } + + #[test] + fn start() { + let mut reader = + Reader::from_str("<%param; %param;='%param;' %param;=\"%param;\" %param;=%param;>"); + + let name_len = "%param;".len(); + assert_eq!( + reader.read_event().unwrap(), + 
Start(BytesStart::from_content( + "%param; %param;='%param;' %param;=\"%param;\" %param;=%param;", + name_len + )), + ); + } + + #[test] + fn end() { + let mut reader = Reader::from_str("<>"); + reader.config_mut().check_end_names = false; + + // Skip <> + reader.read_event().unwrap(); + assert_eq!(reader.read_event().unwrap(), End(BytesEnd::new("%param;"))); + } +} + +#[test] +fn mixed_text() { + let input = "text with <&' ' or ' '"; + let mut r = Reader::from_str(input); + + assert_eq!( + r.read_event().unwrap(), + Text(BytesText::from_escaped("text with ")) + ); + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("lt"))); + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("amp"))); + assert_eq!(r.read_event().unwrap(), Text(BytesText::from_escaped("'"))); + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("#32"))); + assert_eq!( + r.read_event().unwrap(), + Text(BytesText::from_escaped("' or '")) + ); + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("#x20"))); + assert_eq!(r.read_event().unwrap(), Text(BytesText::from_escaped("'"))); + assert_eq!(r.read_event().unwrap(), Eof); +} diff --git a/tests/reader.rs b/tests/reader.rs index 2bc27e57..e05166ec 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -1,6 +1,6 @@ use std::str::from_utf8; -use quick_xml::events::{BytesCData, BytesEnd, BytesStart, BytesText, Event::*}; +use quick_xml::events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event::*}; use quick_xml::name::QName; use quick_xml::reader::Reader; @@ -163,16 +163,17 @@ fn test_escaped_content() { let mut r = Reader::from_str("<test>"); assert_eq!(r.read_event().unwrap(), Start(BytesStart::new("a"))); + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("lt"))); match r.read_event() { Ok(Text(e)) => { assert_eq!( &*e, - b"<test>", - "content unexpected: expecting '<test>', got '{:?}'", + b"test", + "content unexpected: expecting 'test', got '{:?}'", from_utf8(&e) ); match e.unescape() 
{ - Ok(c) => assert_eq!(c, ""), + Ok(c) => assert_eq!(c, "test"), Err(e) => panic!( "cannot escape content at position {}: {:?}", r.error_position(), @@ -187,6 +188,7 @@ fn test_escaped_content() { e ), } + assert_eq!(r.read_event().unwrap(), GeneralRef(BytesRef::new("gt"))); assert_eq!(r.read_event().unwrap(), End(BytesEnd::new("a"))); } From 08ec03a28dc8a6c9a188f968a007d1743068494a Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 21 Jun 2024 15:46:07 +0500 Subject: [PATCH 3/6] Update `custom_entities` example to show how to process events from expanded entities --- Changelog.md | 2 +- examples/custom_entities.rs | 243 +++++++++++++++++++++++++++--------- 2 files changed, 188 insertions(+), 57 deletions(-) diff --git a/Changelog.md b/Changelog.md index 7b6efd9f..160ef7b6 100644 --- a/Changelog.md +++ b/Changelog.md @@ -18,7 +18,7 @@ Now references to entities (as predefined, such as `<`, as user-defined) reported as a new `Event::GeneralRef`. Caller can parse the content of the entity and stream events from it as it is required by the -XML specification. +XML specification. See the updated `custom_entities` example! ### New Features diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index 37d172ac..be9d11ea 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -1,5 +1,7 @@ -//! This example demonstrate how custom entities can be extracted from the DOCTYPE!, -//! and later use to decode text and attribute values. +//! This example demonstrate how custom entities can be extracted from the DOCTYPE, +//! and later use to: +//! - insert new pieces of document (particular case - insert only textual content) +//! - decode attribute values //! //! NB: this example is deliberately kept simple: //! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data) @@ -7,70 +9,199 @@ //! * the regex in this example is simple but brittle; //! 
* it does not support the use of entities in entity declaration. -use std::collections::HashMap; +use std::borrow::Cow; +use std::collections::{HashMap, VecDeque}; +use std::str::from_utf8; -use quick_xml::escape::resolve_predefined_entity; -use quick_xml::events::Event; +use quick_xml::encoding::Decoder; +use quick_xml::errors::Error; +use quick_xml::escape::EscapeError; +use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event}; +use quick_xml::name::QName; use quick_xml::reader::Reader; use regex::bytes::Regex; -const DATA: &str = r#" +use pretty_assertions::assert_eq; - - - ]> - &msg; +struct MyReader<'i> { + /// Stack of readers, the first element is the initial reader, the other are + /// readers created for each resolved entity + readers: VecDeque>, + /// Map of captured internal _parsed general entities_. _Parsed_ means that + /// value of the entity is parsed by XML reader + entities: HashMap<&'i [u8], &'i [u8]>, + /// In this example we use simple regular expression to capture entities from DTD. + /// In real application you should use DTD parser. 
+ entity_re: Regex, +} +impl<'i> MyReader<'i> { + fn new(input: &'i str) -> Result { + let mut reader = Reader::from_str(input); + reader.config_mut().trim_text(true); -"#; + let mut readers = VecDeque::new(); + readers.push_back(reader); -fn main() -> Result<(), Box> { - let mut reader = Reader::from_str(DATA); - reader.config_mut().trim_text(true); - - let mut custom_entities: HashMap = HashMap::new(); - let entity_re = Regex::new(r#""#)?; - - loop { - match reader.read_event() { - Ok(Event::DocType(ref e)) => { - for cap in entity_re.captures_iter(e) { - custom_entities.insert( - reader.decoder().decode(&cap[1])?.into_owned(), - reader.decoder().decode(&cap[2])?.into_owned(), - ); - } - } - Ok(Event::Start(ref e)) => { - if let b"test" = e.name().as_ref() { - let attributes = e - .attributes() - .map(|a| { - a.unwrap() - .decode_and_unescape_value_with(reader.decoder(), |ent| { - custom_entities.get(ent).map(|s| s.as_str()) - }) - .unwrap() - .into_owned() - }) - .collect::>(); - println!("attributes values: {:?}", attributes); + // Capture "name" and "content" from such string: + // + let entity_re = Regex::new(r#""#)?; + Ok(Self { + readers, + entities: HashMap::new(), + entity_re, + }) + } + fn read_event(&mut self) -> Result, Error> { + loop { + if let Some(mut reader) = self.readers.pop_back() { + match dbg!(reader.read_event())? { + // Capture defined entities from the DTD inside document and skip that event + Event::DocType(e) => { + self.readers.push_back(reader); + self.capture(e); + continue; + } + // When entity is referenced, create new reader with the same settings as + // the current reader have and push it to the top of stack. Then try to + // read next event from it (on next iteration) + Event::GeneralRef(e) => { + if let Some(ch) = e.resolve_char_ref()? 
{ + self.readers.push_back(reader); + return Ok(Event::Text(BytesText::from_escaped(ch.to_string()))); + } + let mut r = Reader::from_reader(self.resolve(&e)?); + *r.config_mut() = reader.config().clone(); + + self.readers.push_back(reader); + self.readers.push_back(r); + continue; + } + // When reader is exhausted, do not return it to the stack + Event::Eof => continue, + + // Return all other events to caller + e => { + self.readers.push_back(reader); + return Ok(e); + } } } - Ok(Event::Text(ref e)) => { - println!( - "text value: {}", - e.unescape_with(|ent| match custom_entities.get(ent) { - Some(s) => Some(s.as_str()), - None => resolve_predefined_entity(ent), - }) - .unwrap() - ); - } - Ok(Event::Eof) => break, - Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), - _ => (), + return Ok(Event::Eof); } } + + /// In this example we use simple regular expression to capture entities from DTD. + /// In real application you should use DTD parser + fn capture(&mut self, doctype: BytesText<'i>) { + let doctype = match doctype.into_inner() { + Cow::Borrowed(doctype) => doctype, + Cow::Owned(_) => unreachable!("We are sure that event will be borrowed"), + }; + for cap in self.entity_re.captures_iter(doctype) { + self.entities.insert( + cap.get(1).unwrap().as_bytes(), + cap.get(2).unwrap().as_bytes(), + ); + } + } + + fn resolve(&self, entity: &[u8]) -> Result<&'i [u8], EscapeError> { + match self.entities.get(entity) { + Some(replacement) => Ok(replacement), + None => Err(EscapeError::UnrecognizedEntity( + 0..0, + String::from_utf8_lossy(entity).into_owned(), + )), + } + } + + fn get_entity(&self, entity: &str) -> Option<&'i str> { + self.entities + .get(entity.as_bytes()) + // SAFETY: We are sure that slices are correct UTF-8 because we get + // them from rust string + .map(|value| from_utf8(value).unwrap()) + } + + fn decoder(&self) -> Decoder { + self.readers.back().unwrap().decoder() + } +} + +fn main() -> Result<(), Box> { + let mut reader 
= MyReader::new( + r#" + + " > + &element1;" > + ]> + '&element2;' + "#, + )?; + + let event = reader.read_event()?; + assert_eq!( + event, + Event::Start(BytesStart::from_content( + r#"test label="Message: &text;""#, + 4 + )) + ); + if let Event::Start(e) = event { + let mut attrs = e.attributes(); + + let label = attrs.next().unwrap()?; + assert_eq!(label.key, QName(b"label")); + assert_eq!( + label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + "Message: hello world" + ); + + assert_eq!(attrs.next(), None); + } + + // This is decoded decimal character reference ' + assert_eq!( + reader.read_event()?, + Event::Text(BytesText::from_escaped("'")) + ); + + //-------------------------------------------------------------------------- + // This part was inserted into original document from entity defined in DTD + + assert_eq!(reader.read_event()?, Event::Start(BytesStart::new("a"))); + let event = reader.read_event()?; + assert_eq!( + event, + Event::Empty(BytesStart::from_content( + r#"dtd attr = 'Message: &text;'"#, + 3 + )) + ); + if let Event::Start(e) = event { + let mut attrs = e.attributes(); + + let attr = attrs.next().unwrap()?; + assert_eq!(attr.key, QName(b"attr")); + assert_eq!( + attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + "Message: hello world" + ); + + assert_eq!(attrs.next(), None); + } + assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("a"))); + //-------------------------------------------------------------------------- + + // This is decoded hexadecimal character reference ' + assert_eq!( + reader.read_event()?, + Event::Text(BytesText::from_escaped("'")) + ); + + assert_eq!(reader.read_event()?, Event::End(BytesEnd::new("test"))); + assert_eq!(reader.read_event()?, Event::Eof); + Ok(()) } From 094a88e7a25171425186be3b3590b512c2cb2efc Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 12 Jun 2024 01:10:51 +0500 Subject: [PATCH 4/6] Replace `BytesText::unescape` and 
`unescape_with` by `decode` Text events produced by the Reader cannot contain escaped data anymore; all such data is represented by Event::GeneralRef --- Changelog.md | 3 +++ benches/macrobenches.rs | 8 ++++---- benches/microbenches.rs | 2 +- fuzz/fuzz_targets/fuzz_target_1.rs | 2 +- src/de/mod.rs | 6 ++---- src/events/mod.rs | 28 ++++------------------------ src/reader/async_tokio.rs | 6 +++--- src/reader/buffered_reader.rs | 2 +- src/reader/mod.rs | 2 +- src/reader/ns_reader.rs | 8 ++++---- src/reader/slice_reader.rs | 2 +- tests/encodings.rs | 2 +- tests/fuzzing.rs | 2 +- tests/reader.rs | 2 +- tests/roundtrip.rs | 2 +- 15 files changed, 29 insertions(+), 48 deletions(-) diff --git a/Changelog.md b/Changelog.md index 160ef7b6..ca0c854d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -29,6 +29,9 @@ XML specification. See the updated `custom_entities` example! ### Misc Changes +- [#766]: `BytesText::unescape` and `BytesText::unescape_with` are replaced by `BytesText::decode`. + Now Text events do not contain escaped parts; those are reported as `Event::GeneralRef`.
+ [#766]: https://github.com/tafia/quick-xml/pull/766 [general entity]: https://www.w3.org/TR/xml11/#gen-entity diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 2b882b12..a89c34e4 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -54,7 +54,7 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { } } Event::Text(e) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); @@ -79,7 +79,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { } } Event::Text(e) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); @@ -105,7 +105,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); criterion::black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { @@ -133,7 +133,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { } } (resolved_ns, Event::Text(e)) => { - criterion::black_box(e.unescape()?); + criterion::black_box(e.decode()?); criterion::black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 2f4ece04..498ad7a2 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -145,7 +145,7 @@ fn one_event(c: &mut Criterion) { config.trim_text(true); config.check_end_names = false; match r.read_event() { - Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(), + Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index d13e6081..dbadfe2f 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -43,7 
+43,7 @@ where | Ok(Event::Comment(ref e)) | Ok(Event::DocType(ref e)) => { debug_format!(e); - if let Err(err) = e.unescape() { + if let Err(err) = e.decode() { debug_format!(err); break; } diff --git a/src/de/mod.rs b/src/de/mod.rs index 484c31b0..a5bacfae 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2223,9 +2223,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // FIXME: Actually, we should trim after decoding text, but now we trim before e.inplace_trim_end(); } - result - .to_mut() - .push_str(&e.unescape_with(|entity| self.entity_resolver.resolve(entity))?); + result.to_mut().push_str(&e.decode()?); } PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?), @@ -2247,7 +2245,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // FIXME: Actually, we should trim after decoding text, but now we trim before continue; } - self.drain_text(e.unescape_with(|entity| self.entity_resolver.resolve(entity))?) + self.drain_text(e.decode()?) } PayloadEvent::CData(e) => self.drain_text(e.decode()?), PayloadEvent::DocType(e) => { diff --git a/src/events/mod.rs b/src/events/mod.rs index c274085a..e8b46f15 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -47,10 +47,7 @@ use std::str::from_utf8; use crate::encoding::{Decoder, EncodingError}; use crate::errors::{Error, IllFormedError}; -use crate::escape::{ - escape, minimal_escape, parse_number, partial_escape, resolve_predefined_entity, unescape_with, - EscapeError, -}; +use crate::escape::{escape, minimal_escape, parse_number, partial_escape, EscapeError}; use crate::name::{LocalName, QName}; #[cfg(feature = "serialize")] use crate::utils::CowRef; @@ -580,29 +577,12 @@ impl<'a> BytesText<'a> { } } - /// Decodes then unescapes the content of the event. - /// - /// This will allocate if the value contains any escape sequences or in - /// non-UTF-8 encoding. 
- pub fn unescape(&self) -> Result, Error> { - self.unescape_with(resolve_predefined_entity) - } - - /// Decodes then unescapes the content of the event with custom entities. + /// Decodes the content of the event. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn unescape_with<'entity>( - &self, - resolve_entity: impl FnMut(&str) -> Option<&'entity str>, - ) -> Result, Error> { - let decoded = self.decoder.decode_cow(&self.content)?; - - match unescape_with(&decoded, resolve_entity)? { - // Because result is borrowed, no replacements was done and we can use original string - Cow::Borrowed(_) => Ok(decoded), - Cow::Owned(s) => Ok(s.into()), - } + pub fn decode(&self) -> Result, EncodingError> { + self.decoder.decode_cow(&self.content) } /// Removes leading XML whitespace bytes from text content. diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index a9237de0..a79ced82 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -103,7 +103,7 @@ impl Reader { /// loop { /// match reader.read_event_into_async(&mut buf).await { /// Ok(Event::Start(_)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), + /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), @@ -237,7 +237,7 @@ impl NsReader { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -373,7 +373,7 @@ impl NsReader { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 
f268448c..44930420 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -372,7 +372,7 @@ impl Reader { /// loop { /// match reader.read_event_into(&mut buf) { /// Ok(Event::Start(_)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), + /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), diff --git a/src/reader/mod.rs b/src/reader/mod.rs index cf806e3e..f95327fb 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -718,7 +718,7 @@ where /// _ => (), /// } /// } -/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), +/// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()), /// /// // There are several other `Event`s we do not consider here /// _ => (), diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 07220815..d9f84f62 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -419,7 +419,7 @@ impl NsReader { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -478,7 +478,7 @@ impl NsReader { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), @@ -664,7 +664,7 @@ impl<'i> NsReader<&'i [u8]> { /// } /// } /// Event::Text(e) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -726,7 +726,7 @@ impl<'i> NsReader<&'i [u8]> { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.unescape().unwrap().into_owned()) + /// txt.push(e.decode().unwrap().into_owned()) /// } 
/// (_, Event::Eof) => break, /// _ => (), diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 37439597..c3b501ec 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -62,7 +62,7 @@ impl<'a> Reader<&'a [u8]> { /// loop { /// match reader.read_event().unwrap() { /// Event::Start(e) => count += 1, - /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()), + /// Event::Text(e) => txt.push(e.decode().unwrap().into_owned()), /// Event::Eof => break, /// _ => (), /// } diff --git a/tests/encodings.rs b/tests/encodings.rs index 5f5676fa..7b64e167 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -37,7 +37,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event_into(&mut buf) { Ok(Text(e)) => { - e.unescape().unwrap(); + e.decode().unwrap(); } Ok(Eof) => break, _ => (), diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index 2740763c..25cf6989 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -38,7 +38,7 @@ fn fuzz_101() { } } Ok(Event::Text(e)) => { - if e.unescape().is_err() { + if e.decode().is_err() { break; } } diff --git a/tests/reader.rs b/tests/reader.rs index e05166ec..fecdeabc 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -172,7 +172,7 @@ fn test_escaped_content() { "content unexpected: expecting 'test', got '{:?}'", from_utf8(&e) ); - match e.unescape() { + match e.decode() { Ok(c) => assert_eq!(c, "test"), Err(e) => panic!( "cannot escape content at position {}: {:?}", diff --git a/tests/roundtrip.rs b/tests/roundtrip.rs index 68726195..4fb9ec53 100644 --- a/tests/roundtrip.rs +++ b/tests/roundtrip.rs @@ -236,7 +236,7 @@ fn reescape_text() { match reader.read_event().unwrap() { Eof => break, Text(e) => { - let t = e.unescape().unwrap(); + let t = e.decode().unwrap(); assert!(writer.write_event(Text(BytesText::new(&t))).is_ok()); } e => assert!(writer.write_event(e).is_ok()), From dcc3a6c30defe791ce047975745a02f82725accd Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 12 Jun 2024 
01:11:37 +0500 Subject: [PATCH 5/6] Rework entity resolution in serde Deserializer Fixed (18): serde-de (9): borrow::escaped::element borrow::escaped::top_level resolve::resolve_custom_entity trivial::text::byte_buf trivial::text::bytes trivial::text::string::field trivial::text::string::naked trivial::text::string::text xml_schema_lists::element::text::string serde-migrated (1): test_parse_string serde-se (5): with_root::char_amp with_root::char_gt with_root::char_lt with_root::str_escaped with_root::tuple --doc (3): src\de\resolver.rs - de::resolver::EntityResolver (line 13) --- src/de/mod.rs | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index a5bacfae..31ec30cb 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2014,7 +2014,8 @@ use crate::{ de::map::ElementMapAccess, encoding::Decoder, errors::Error, - events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, + escape::{parse_number, EscapeError}, + events::{BytesCData, BytesEnd, BytesRef, BytesStart, BytesText, Event}, name::QName, reader::Reader, utils::CowRef, @@ -2133,6 +2134,8 @@ pub enum PayloadEvent<'a> { CData(BytesCData<'a>), /// Document type definition data (DTD) stored in `<!DOCTYPE ...>`. DocType(BytesText<'a>), + /// Reference `&ref;` in the textual data. + GeneralRef(BytesRef<'a>), /// End of XML document.
Eof, } @@ -2147,6 +2150,7 @@ impl<'a> PayloadEvent<'a> { PayloadEvent::Text(e) => PayloadEvent::Text(e.into_owned()), PayloadEvent::CData(e) => PayloadEvent::CData(e.into_owned()), PayloadEvent::DocType(e) => PayloadEvent::DocType(e.into_owned()), + PayloadEvent::GeneralRef(e) => PayloadEvent::GeneralRef(e.into_owned()), PayloadEvent::Eof => PayloadEvent::Eof, } } @@ -2201,7 +2205,7 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { // If next event is a text or CDATA, we should not trim trailing spaces !matches!( self.lookahead, - Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_)) + Ok(PayloadEvent::Text(_)) | Ok(PayloadEvent::CData(_) | PayloadEvent::GeneralRef(_)) ) } @@ -2226,9 +2230,10 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { result.to_mut().push_str(&e.decode()?); } PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?), + PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, - // SAFETY: current_event_is_last_text checks that event is Text or CData - _ => unreachable!("Only `Text` and `CData` events can come here"), + // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef + _ => unreachable!("Only `Text`, `CData` or `GeneralRef` events can come here"), } } Ok(DeEvent::Text(Text { text: result })) @@ -2254,11 +2259,32 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { .map_err(|err| DeError::Custom(format!("cannot parse DTD: {}", err)))?; continue; } + PayloadEvent::GeneralRef(e) => { + let mut text = String::new(); + self.resolve_reference(&mut text, e)?; + self.drain_text(text.into()) + } PayloadEvent::Eof => Ok(DeEvent::Eof), }; } } + fn resolve_reference(&mut self, result: &mut String, event: BytesRef) -> Result<(), DeError> { + let len = event.len(); + let reference = self.decoder().decode(&event)?; + + if let Some(num) = reference.strip_prefix('#') { + let codepoint = parse_number(num).map_err(EscapeError::InvalidCharRef)?; + 
result.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + return Ok(()); + } + if let Some(value) = self.entity_resolver.resolve(reference.as_ref()) { + result.push_str(value); + return Ok(()); + } + Err(EscapeError::UnrecognizedEntity(0..len, reference.to_string()).into()) + } + #[inline] fn read_to_end(&mut self, name: QName) -> Result<(), DeError> { match self.lookahead { @@ -3027,7 +3053,7 @@ impl StartTrimmer { Event::End(e) => (PayloadEvent::End(e), true), Event::Eof => (PayloadEvent::Eof, true), - // Do not trim next text event after Text or CDATA event + // Do not trim next text event after Text, CDATA or reference event Event::CData(e) => (PayloadEvent::CData(e), false), Event::Text(mut e) => { // If event is empty after trimming, skip it @@ -3036,6 +3062,7 @@ impl StartTrimmer { } (PayloadEvent::Text(e), false) } + Event::GeneralRef(e) => (PayloadEvent::GeneralRef(e), false), _ => return None, }; From 0631d47c5affe7dff4a802358bace9036aedfd6d Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 7 Jul 2024 18:35:31 +0500 Subject: [PATCH 6/6] Add `allow_dangling_amp` configuration option and allow dangling `&` --- Changelog.md | 3 ++ src/reader/buffered_reader.rs | 6 ++-- src/reader/mod.rs | 62 +++++++++++++++++++++++++------- src/reader/slice_reader.rs | 8 ++--- tests/reader-config.rs | 68 ++++++++++++++++++++++++++++++++++- 5 files changed, 127 insertions(+), 20 deletions(-) diff --git a/Changelog.md b/Changelog.md index ca0c854d..3c1f0a00 100644 --- a/Changelog.md +++ b/Changelog.md @@ -24,6 +24,9 @@ XML specification. See the updated `custom_entities` example! - [#766]: Allow to parse resolved entities as XML fragments and stream events from them. - [#766]: Added new event `Event::GeneralRef` with content of [general entity]. +- [#766]: Added new configuration option `allow_dangling_amp` which allows to have + a `&` not followed by `;` in the textual data which is required for some applications + for compatibility reasons. 
### Bug Fixes diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 44930420..9b47da34 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -161,7 +161,7 @@ macro_rules! impl_buffered_source { *position += read; - return ReadRefResult::UpToRef; + return ReadRefResult::UpToRef(&buf[start..]); } Some(i) => { let is_end = available[i] == b';'; @@ -177,7 +177,7 @@ macro_rules! impl_buffered_source { return if is_end { ReadRefResult::Ref(&buf[start..]) } else { - ReadRefResult::UpToMarkup + ReadRefResult::UpToMarkup(&buf[start..]) }; } None => { @@ -191,7 +191,7 @@ macro_rules! impl_buffered_source { } *position += read; - ReadRefResult::UpToEof + ReadRefResult::UpToEof(&buf[start..]) } #[inline] diff --git a/src/reader/mod.rs b/src/reader/mod.rs index f95327fb..49a9e249 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -24,6 +24,32 @@ use crate::reader::state::ReaderState; #[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))] #[non_exhaustive] pub struct Config { + /// Whether lone ampersand character (without a paired semicolon) should be + /// allowed in textual content. Unless enabled, in case of a dangling ampersand, + /// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods. 
+ /// Default: `false` + /// + /// # Example + /// + /// ``` + /// # use quick_xml::events::{BytesRef, BytesText, Event}; + /// # use quick_xml::reader::Reader; + /// # use pretty_assertions::assert_eq; + /// let mut reader = Reader::from_str("text with & &amp; & alone"); + /// reader.config_mut().allow_dangling_amp = true; + /// + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with "))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& "))); + /// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp"))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" "))); + /// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone"))); + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference + pub allow_dangling_amp: bool, + /// Whether unmatched closing tag names should be allowed. Unless enabled, /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`] /// is returned from read methods. @@ -210,6 +236,7 @@ impl Config { impl Default for Config { fn default() -> Self { Self { + allow_dangling_amp: false, allow_unmatched_ends: false, check_comments: false, check_end_names: true, @@ -261,18 +288,29 @@ macro_rules!
read_event_impl { Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder()))) } // Go to Done state - ReadRefResult::UpToEof => { + ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => { + $self.state.state = ParseState::Done; + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToEof(_) => { $self.state.state = ParseState::Done; $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) } // Do not change state, stay in InsideRef - ReadRefResult::UpToRef => { + ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => { + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToRef(_) => { $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) } // Go to InsideMarkup state - ReadRefResult::UpToMarkup => { + ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => { + $self.state.state = ParseState::InsideMarkup; + Ok(Event::Text($self.state.emit_text(bytes))) + } + ReadRefResult::UpToMarkup(_) => { $self.state.state = ParseState::InsideMarkup; $self.state.last_error_offset = start; Err(Error::IllFormed(IllFormedError::UnclosedReference)) @@ -997,13 +1035,13 @@ enum ReadRefResult<'r> { /// Contains text block up to EOF. Neither end of reference (`;`), start of /// another reference (`&`) or start of markup (`<`) characters was found. /// Result includes start `&`. - UpToEof, + UpToEof(&'r [u8]), /// Contains text block up to next possible reference (`&` character). /// Result includes start `&`. - UpToRef, + UpToRef(&'r [u8]), /// Contains text block up to start of markup (`<` character). /// Result includes start `&`. - UpToMarkup, + UpToMarkup(&'r [u8]), /// IO error occurred. Err(io::Error), } @@ -1722,8 +1760,8 @@ mod test { // ^= 2 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? 
{ - ReadRefResult::UpToEof => (), - x => panic!("Expected `UpToEof`, but got `{:?}`", x), + ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x), } assert_eq!(position, 2); } @@ -1736,8 +1774,8 @@ mod test { // ^= 2 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::UpToRef => (), - x => panic!("Expected `UpToRef`, but got `{:?}`", x), + ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x), } assert_eq!(position, 2); } @@ -1750,8 +1788,8 @@ mod test { // ^= 3 match $source(&mut input).read_ref(buf, &mut position) $(.$await)? { - ReadRefResult::UpToMarkup => (), - x => panic!("Expected `UpToMarkup`, but got `{:?}`", x), + ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")), + x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x), } assert_eq!(position, 3); } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index c3b501ec..311edf6a 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -306,11 +306,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { // Do not consume `&` because it may be lone and we would be need to // return it as part of Text event Some(i) if self[i + 1] == b'&' => { - let (_, rest) = self.split_at(i + 1); + let (bytes, rest) = self.split_at(i + 1); *self = rest; *position += i as u64 + 1; - ReadRefResult::UpToRef + ReadRefResult::UpToRef(bytes) } Some(i) => { let end = i + 1; @@ -323,7 +323,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { if is_end { ReadRefResult::Ref(bytes) } else { - ReadRefResult::UpToMarkup + ReadRefResult::UpToMarkup(bytes) } } None => { @@ -331,7 +331,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { *self = &[]; *position += bytes.len() as u64; - ReadRefResult::UpToEof + ReadRefResult::UpToEof(bytes) } } } diff --git a/tests/reader-config.rs b/tests/reader-config.rs index 
8796075e..09f820a3 100644 --- a/tests/reader-config.rs +++ b/tests/reader-config.rs @@ -6,9 +6,75 @@ //! Please keep tests sorted (exceptions are allowed if options are tightly related). use quick_xml::errors::{Error, IllFormedError}; -use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText, Event}; +use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesRef, BytesStart, BytesText, Event}; use quick_xml::reader::Reader; +mod allow_dangling_amp { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn false_() { + let mut reader = Reader::from_str("&&&lt;&"); + reader.config_mut().allow_dangling_amp = false; + + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 0..1); + + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 1..2); + + assert_eq!( + reader.read_event().unwrap(), + Event::GeneralRef(BytesRef::new("lt")) + ); + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnclosedReference); + } + x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x), + } + assert_eq!(reader.error_position()..reader.buffer_position(), 6..7); + + assert_eq!(reader.read_event().unwrap(), Event::Eof); + assert_eq!(reader.error_position()..reader.buffer_position(), 6..7); + } + + #[test] + fn true_() { + let mut reader = Reader::from_str("&&&lt;&"); + reader.config_mut().allow_dangling_amp = true; + + assert_eq!( + reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); +
assert_eq!( + reader.read_event().unwrap(), + Event::GeneralRef(BytesRef::new("lt")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::Text(BytesText::from_escaped("&")) + ); + assert_eq!(reader.read_event().unwrap(), Event::Eof); + } +} + mod allow_unmatched_ends { use super::*; use pretty_assertions::assert_eq;