Skip to content

Commit

Permalink
Update the escape / unescape functions to return utf8 types
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jul 15, 2022
1 parent f829d42 commit 97ff3f8
Show file tree
Hide file tree
Showing 16 changed files with 203 additions and 246 deletions.
24 changes: 12 additions & 12 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use quick_xml::Reader;
static SAMPLE: &[u8] = include_bytes!("../tests/documents/sample_rss.xml");
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");

static LOREM_IPSUM_TEXT: &[u8] =
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
static LOREM_IPSUM_TEXT: &str =
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
Expand Down Expand Up @@ -299,20 +299,20 @@ fn escaping(c: &mut Criterion) {

group.bench_function("no_chars_to_escape_short", |b| {
b.iter(|| {
criterion::black_box(escape(b"just bit of text"));
criterion::black_box(escape("just bit of text"));
})
});

group.bench_function("escaped_chars_short", |b| {
b.iter(|| {
criterion::black_box(escape(b"age > 72 && age < 21"));
criterion::black_box(escape(b"\"what's that?\""));
criterion::black_box(escape("age > 72 && age < 21"));
criterion::black_box(escape("\"what's that?\""));
})
});

group.bench_function("escaped_chars_long", |b| {
let lorem_ipsum_with_escape_chars =
b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. & Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies tristique nulla aliquet enim tortor at. Fermentum odio eu feugiat pretium nibh ipsum.
Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu cursus
Expand Down Expand Up @@ -345,31 +345,31 @@ fn unescaping(c: &mut Criterion) {

group.bench_function("no_chars_to_unescape_short", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit of text")).unwrap();
criterion::black_box(unescape("just a bit of text")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = b"&#38;&#60;";
let text = "&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = b"&quot;what&apos;s that?&quot;";
let text = "&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("mixed", |b| {
let text =
b"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
"Lorem ipsum dolor sit amet, &amp;consectetur adipiscing elit, sed do eiusmod tempor incididunt
ut labore et dolore magna aliqua. Hac habitasse platea dictumst vestibulum rhoncus est pellentesque.
Risus ultricies &quot;tristique nulla aliquet enim tortor&quot; at. Fermentum odio eu feugiat pretium
nibh ipsum. Volutpat sed cras ornare arcu dui. Scelerisque fermentum dui faucibus in ornare quam. Arcu
Expand Down
17 changes: 7 additions & 10 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let mut custom_entities: HashMap<String, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(
cap[1].to_vec(),
String::from_utf8(cap[1].to_owned())?,
String::from_utf8(cap[2].to_owned())?, // TODO(dalley): this is temporary
);
}
Expand All @@ -46,26 +46,23 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let attributes = e
.attributes()
.map(|a| {
let a = a.unwrap();
let txt = a
a.unwrap()
.unescape_value_with(|ent| {
custom_entities.get(ent).map(|s| s.as_str())
})
.unwrap();
String::from_utf8(txt.to_vec()).unwrap() // TODO(dalley): this is temporary
.unwrap()
.into_owned()
})
.collect::<Vec<String>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
let txt = e
.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str()))
.unwrap();
println!(
"text value: {}",
std::str::from_utf8(txt.as_ref())? // TODO(dalley): this is temporary
e.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str()))
.unwrap()
);
}
Ok(Event::Eof) => break,
Expand Down
18 changes: 8 additions & 10 deletions src/de/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,19 @@ use std::borrow::Cow;
#[derive(Clone, Debug)]
pub struct EscapedDeserializer<'a> {
/// Possibly escaped value of text/CDATA or attribute value
escaped_value: Cow<'a, [u8]>,
escaped_value: Cow<'a, str>,
/// If `true`, value requires unescaping before using
escaped: bool,
}

impl<'a> EscapedDeserializer<'a> {
pub fn new(escaped_value: Cow<'a, [u8]>, escaped: bool) -> Self {
pub fn new(escaped_value: Cow<'a, str>, escaped: bool) -> Self {
EscapedDeserializer {
escaped_value,
escaped,
}
}
fn unescaped(&self) -> Result<Cow<[u8]>, DeError> {
fn unescaped(&self) -> Result<Cow<str>, DeError> {
if self.escaped {
unescape(&self.escaped_value).map_err(|e| DeError::InvalidXml(Error::EscapeError(e)))
} else {
Expand All @@ -43,7 +43,7 @@ macro_rules! deserialize_num {
where
V: Visitor<'de>,
{
let value = String::from_utf8(self.escaped_value.as_ref().to_vec())?.parse()?; // TODO(dalley): this is temporary
let value = self.escaped_value.parse()?;

visitor.$visit(value)
}
Expand All @@ -65,17 +65,15 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
V: Visitor<'de>,
{
let unescaped = self.unescaped()?;
let value = String::from_utf8(unescaped.to_vec())?; // TODO(dalley): this is temporary

visitor.visit_str(&value)
visitor.visit_str(&unescaped)
}

fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: Visitor<'de>,
{
let v = self.unescaped()?;
visitor.visit_bytes(&v)
visitor.visit_bytes(&v.as_bytes())
}

fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
Expand All @@ -96,7 +94,7 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
where
V: Visitor<'de>,
{
deserialize_bool(self.escaped_value.as_ref(), visitor)
deserialize_bool(&self.escaped_value.as_bytes(), visitor)
}

fn deserialize_char<V>(self, visitor: V) -> Result<V::Value, Self::Error>
Expand All @@ -117,7 +115,7 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
where
V: Visitor<'de>,
{
if self.escaped_value.as_ref().is_empty() {
if self.escaped_value.is_empty() {
visitor.visit_none()
} else {
visitor.visit_some(self)
Expand Down
11 changes: 7 additions & 4 deletions src/de/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,11 @@ where
// try getting map from attributes (key= "value")
let (key, value) = a.into();
self.source = ValueSource::Attribute(value.unwrap_or_default());
seed.deserialize(EscapedDeserializer::new(Cow::Borrowed(&slice[key]), false))
.map(Some)
seed.deserialize(EscapedDeserializer::new(
Cow::Borrowed(std::str::from_utf8(&slice[key])?),
false,
)) // TODO(dalley): this is temporary
.map(Some)
} else {
// try getting from events (<key>value</key>)
match self.de.peek()? {
Expand Down Expand Up @@ -288,8 +291,8 @@ where
// }
seed.deserialize(self.unflatten_fields.remove(p).into_deserializer())
} else {
let name = Cow::Borrowed(e.local_name().into_inner());
seed.deserialize(EscapedDeserializer::new(name, false))
let name = std::str::from_utf8(e.local_name().into_inner())?; // TODO(dalley): this is temporary
seed.deserialize(EscapedDeserializer::new(Cow::Borrowed(name), false))
};
key.map(Some)
}
Expand Down
21 changes: 9 additions & 12 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ mod tests {
de.write,
vec![
Start(BytesStart::borrowed_name(b"inner")),
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
Start(BytesStart::borrowed_name(b"inner")),
End(BytesEnd::borrowed(b"inner")),
End(BytesEnd::borrowed(b"inner")),
Expand Down Expand Up @@ -1068,7 +1068,7 @@ mod tests {
de.read,
vec![
Start(BytesStart::borrowed_name(b"inner")),
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
Start(BytesStart::borrowed_name(b"inner")),
End(BytesEnd::borrowed(b"inner")),
End(BytesEnd::borrowed(b"inner")),
Expand All @@ -1095,7 +1095,7 @@ mod tests {
vec![
// This comment is here to keep the formatting of both arrays the same;
// otherwise rustfmt suggests putting the array on one line
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
]
);

Expand All @@ -1118,15 +1118,12 @@ mod tests {
assert_eq!(
de.read,
vec![
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
End(BytesEnd::borrowed(b"inner")),
]
);
assert_eq!(de.write, vec![]);
assert_eq!(
de.next().unwrap(),
Text(BytesText::from_escaped_str("text"))
);
assert_eq!(de.next().unwrap(), Text(BytesText::from_escaped("text")));
assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"inner")));
assert_eq!(
de.next().unwrap(),
Expand Down Expand Up @@ -1169,7 +1166,7 @@ mod tests {
de.write,
vec![
Start(BytesStart::borrowed_name(b"skip")),
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
Start(BytesStart::borrowed_name(b"skip")),
End(BytesEnd::borrowed(b"skip")),
End(BytesEnd::borrowed(b"skip")),
Expand All @@ -1193,7 +1190,7 @@ mod tests {
de.write,
vec![
Start(BytesStart::borrowed_name(b"skip")),
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
Start(BytesStart::borrowed_name(b"skip")),
End(BytesEnd::borrowed(b"skip")),
End(BytesEnd::borrowed(b"skip")),
Expand All @@ -1215,7 +1212,7 @@ mod tests {
de.read,
vec![
Start(BytesStart::borrowed_name(b"skip")),
Text(BytesText::from_escaped_str("text")),
Text(BytesText::from_escaped("text")),
Start(BytesStart::borrowed_name(b"skip")),
End(BytesEnd::borrowed(b"skip")),
End(BytesEnd::borrowed(b"skip")),
Expand Down Expand Up @@ -1378,7 +1375,7 @@ mod tests {
br#"item name="hello" source="world.rs""#,
4
)),
Text(BytesText::from_escaped(b"Some text".as_ref())),
Text(BytesText::from_escaped("Some text")),
End(BytesEnd::borrowed(b"item")),
Start(BytesStart::borrowed(b"item2", 5)),
End(BytesEnd::borrowed(b"item2")),
Expand Down
8 changes: 4 additions & 4 deletions src/de/simple_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,9 @@ impl<'de, 'a> Deserializer<'de> for AtomicDeserializer<'de, 'a> {
V: Visitor<'de>,
{
if self.escaped {
match unescape(self.content.as_str().as_bytes())? {
match unescape(self.content.as_str())? {
Cow::Borrowed(_) => self.content.deserialize_item(visitor),
Cow::Owned(buf) => visitor.visit_string(String::from_utf8(buf)?),
Cow::Owned(buf) => visitor.visit_string(buf),
}
} else {
self.content.deserialize_item(visitor)
Expand Down Expand Up @@ -603,9 +603,9 @@ impl<'de, 'a> Deserializer<'de> for SimpleTypeDeserializer<'de, 'a> {
{
let content = self.decode()?;
if self.escaped {
match unescape(content.as_str().as_bytes())? {
match unescape(content.as_str())? {
Cow::Borrowed(_) => content.deserialize_all(visitor),
Cow::Owned(buf) => visitor.visit_string(String::from_utf8(buf)?),
Cow::Owned(buf) => visitor.visit_string(buf),
}
} else {
content.deserialize_all(visitor)
Expand Down
13 changes: 10 additions & 3 deletions src/de/var.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,18 @@ where
V: DeserializeSeed<'de>,
{
let de = match self.de.peek()? {
DeEvent::Text(t) => EscapedDeserializer::new(Cow::Borrowed(t), true),
DeEvent::Text(t) => {
EscapedDeserializer::new(Cow::Borrowed(std::str::from_utf8(t)?), true)
} // TODO(dalley): temporary
// Escape sequences are not processed inside a CDATA section
DeEvent::CData(t) => EscapedDeserializer::new(Cow::Borrowed(t), false),
DeEvent::CData(t) => {
EscapedDeserializer::new(Cow::Borrowed(std::str::from_utf8(t)?), false)
} // TODO(dalley): temporary
DeEvent::Start(e) => {
EscapedDeserializer::new(Cow::Borrowed(e.name().into_inner()), false)
EscapedDeserializer::new(
Cow::Borrowed(std::str::from_utf8(e.name().into_inner())?),
false,
) // TODO(dalley): temporary
}
_ => {
return Err(DeError::Unsupported(
Expand Down
Loading

0 comments on commit 97ff3f8

Please sign in to comment.