Skip to content

Commit 9e1746f

Browse files
committed
Add methods that return EOL-normalized strings in BytesText, BytesCData and BytesRef
1 parent f6dc63b commit 9e1746f

File tree

3 files changed

+156
-2
lines changed

3 files changed

+156
-2
lines changed

Changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,16 @@
2222
- `Deserializer::buffering_with_resolver`
2323
- [#878]: Add ability to serialize structs in `$value` fields. The struct name will
2424
be used as a tag name. Previously only enums were allowed there.
25+
- [#806]: Add `BytesText::xml_content`, `BytesCData::xml_content` and `BytesRef::xml_content`
26+
methods which return XML EOL-normalized strings.
27+
- [#806]: Add `BytesText::html_content`, `BytesCData::html_content` and `BytesRef::html_content`
28+
methods which return HTML EOL-normalized strings.
2529

2630
### Bug Fixes
2731

2832
### Misc Changes
2933

34+
[#806]: https://github.com/tafia/quick-xml/issues/806
3035
[#878]: https://github.com/tafia/quick-xml/pull/878
3136
[#882]: https://github.com/tafia/quick-xml/pull/882
3237

src/encoding.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,30 @@ impl Decoder {
150150
Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
151151
}
152152
}
153+
154+
/// Decodes the `Cow` buffer, normalizes EOLs using the supplied `normalize_eol` function, and preserves the `'b` lifetime of the input
155+
pub(crate) fn content<'b>(
156+
&self,
157+
bytes: &Cow<'b, [u8]>,
158+
normalize_eol: impl Fn(&str) -> Cow<str>,
159+
) -> Result<Cow<'b, str>, EncodingError> {
160+
match bytes {
161+
Cow::Borrowed(bytes) => {
162+
let text = self.decode(bytes)?;
163+
match normalize_eol(&text) {
164+
// If the text is still borrowed after normalization, it was not changed, so the decoded text is returned as is
165+
Cow::Borrowed(_) => Ok(text),
166+
Cow::Owned(s) => Ok(Cow::Owned(s)),
167+
}
168+
}
169+
Cow::Owned(bytes) => {
170+
let text = self.decode(bytes)?;
171+
let text = normalize_eol(&text);
172+
// Convert to owned, because otherwise the `Cow` would be bound to the wrong lifetime
173+
Ok(text.into_owned().into())
174+
}
175+
}
176+
}
153177
}
154178

155179
/// Decodes the provided bytes using the specified encoding.

src/events/mod.rs

Lines changed: 127 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ use std::str::from_utf8;
4848

4949
use crate::encoding::{Decoder, EncodingError};
5050
use crate::errors::{Error, IllFormedError};
51-
use crate::escape::{escape, minimal_escape, parse_number, partial_escape, EscapeError};
51+
use crate::escape::{
52+
escape, minimal_escape, normalize_html_eols, normalize_xml_eols, parse_number, partial_escape,
53+
EscapeError,
54+
};
5255
use crate::name::{LocalName, QName};
5356
use crate::utils::{name_len, trim_xml_end, trim_xml_start, write_cow_string, Bytes};
5457
use attributes::{AttrError, Attribute, Attributes};
@@ -583,6 +586,46 @@ impl<'a> BytesText<'a> {
583586
self.decoder.decode_cow(&self.content)
584587
}
585588

589+
/// Decodes the content of the XML event and normalizes EOLs in it.
590+
///
591+
/// When this event is produced by the reader, it uses the encoding information
592+
/// associated with that reader to interpret the raw bytes contained within
593+
/// this text event.
594+
///
595+
/// This will allocate if the value contains any escape sequences, is in a non-UTF-8
596+
/// encoding, or requires EOL normalization.
597+
///
598+
/// Note that this method should be used only if the event represents XML content,
599+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
600+
///
601+
/// To get HTML content, use [`html_content()`](Self::html_content).
602+
///
603+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
604+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
605+
pub fn xml_content(&self) -> Result<Cow<'a, str>, EncodingError> {
606+
self.decoder.content(&self.content, normalize_xml_eols)
607+
}
608+
609+
/// Decodes the content of the HTML event and normalizes EOLs in it.
610+
///
611+
/// When this event is produced by the reader, it uses the encoding information
612+
/// associated with that reader to interpret the raw bytes contained within
613+
/// this text event.
614+
///
615+
/// This will allocate if the value contains any escape sequences, is in a non-UTF-8
616+
/// encoding, or requires EOL normalization.
617+
///
618+
/// Note that this method should be used only if the event represents HTML content,
619+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
620+
///
621+
/// To get XML content, use [`xml_content()`](Self::xml_content).
622+
///
623+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
624+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
625+
pub fn html_content(&self) -> Result<Cow<'a, str>, EncodingError> {
626+
self.decoder.content(&self.content, normalize_html_eols)
627+
}
628+
586629
/// Removes leading XML whitespace bytes from text content.
587630
///
588631
/// Returns `true` if content is empty after that
@@ -828,7 +871,49 @@ impl<'a> BytesCData<'a> {
828871
/// associated with that reader to interpret the raw bytes contained within this
829872
/// CDATA event.
830873
pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
831-
Ok(self.decoder.decode_cow(&self.content)?)
874+
self.decoder.decode_cow(&self.content)
875+
}
876+
877+
/// Decodes the raw input byte content of the CDATA section of the XML event
878+
/// into a string and normalizes EOLs in it.
879+
///
880+
/// When this event is produced by the reader, it uses the encoding information
881+
/// associated with that reader to interpret the raw bytes contained within
882+
/// this CDATA event.
883+
///
884+
/// This will allocate if the value is in a non-UTF-8 encoding, or if EOL normalization
885+
/// is required.
886+
///
887+
/// Note that this method should be used only if the event represents XML content,
888+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
889+
///
890+
/// To get HTML content, use [`html_content()`](Self::html_content).
891+
///
892+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
893+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
894+
pub fn xml_content(&self) -> Result<Cow<'a, str>, EncodingError> {
895+
self.decoder.content(&self.content, normalize_xml_eols)
896+
}
897+
898+
/// Decodes the raw input byte content of the CDATA section of the HTML event
899+
/// into a string and normalizes EOLs in it.
900+
///
901+
/// When this event is produced by the reader, it uses the encoding information
902+
/// associated with that reader to interpret the raw bytes contained within
903+
/// this CDATA event.
904+
///
905+
/// This will allocate if the value is in a non-UTF-8 encoding, or if EOL normalization
906+
/// is required.
907+
///
908+
/// Note that this method should be used only if the event represents HTML content,
909+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
910+
///
911+
/// To get XML content, use [`xml_content()`](Self::xml_content).
912+
///
913+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
914+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
915+
pub fn html_content(&self) -> Result<Cow<'a, str>, EncodingError> {
916+
self.decoder.content(&self.content, normalize_html_eols)
832917
}
833918
}
834919

@@ -1443,6 +1528,46 @@ impl<'a> BytesRef<'a> {
14431528
self.decoder.decode_cow(&self.content)
14441529
}
14451530

1531+
/// Decodes the content of the XML event and normalizes EOLs in it.
1532+
///
1533+
/// When this event is produced by the reader, it uses the encoding information
1534+
/// associated with that reader to interpret the raw bytes contained within
1535+
/// this general reference event.
1536+
///
1537+
/// This will allocate if the value is in a non-UTF-8 encoding, or if EOL normalization
1538+
/// is required.
1539+
///
1540+
/// Note that this method should be used only if the event represents XML content,
1541+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
1542+
///
1543+
/// To get HTML content, use [`html_content()`](Self::html_content).
1544+
///
1545+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
1546+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
1547+
pub fn xml_content(&self) -> Result<Cow<'a, str>, EncodingError> {
1548+
self.decoder.content(&self.content, normalize_xml_eols)
1549+
}
1550+
1551+
/// Decodes the content of the HTML event and normalizes EOLs in it.
1552+
///
1553+
/// When this event is produced by the reader, it uses the encoding information
1554+
/// associated with that reader to interpret the raw bytes contained within
1555+
/// this general reference event.
1556+
///
1557+
/// This will allocate if the value is in a non-UTF-8 encoding, or if EOL normalization
1558+
/// is required.
1559+
///
1560+
/// Note that this method should be used only if the event represents HTML content,
1561+
/// because the rules for normalizing EOLs in [XML] and [HTML] differ.
1562+
///
1563+
/// To get XML content, use [`xml_content()`](Self::xml_content).
1564+
///
1565+
/// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends
1566+
/// [HTML]: https://html.spec.whatwg.org/#normalize-newlines
1567+
pub fn html_content(&self) -> Result<Cow<'a, str>, EncodingError> {
1568+
self.decoder.content(&self.content, normalize_html_eols)
1569+
}
1570+
14461571
/// Returns `true` if the specified reference represents the character reference
14471572
/// (`&#<number>;`).
14481573
///

0 commit comments

Comments
 (0)