From 1bbb15d3702b058cad2775639669e9a38847374d Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 23 Sep 2023 01:01:19 +0100 Subject: [PATCH 1/5] Fix doctype causing wrong whitespace emitted --- src/reader/parser/inside_doctype.rs | 6 +++--- src/reader/parser/outside_tag.rs | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs index 4508c287..abf897c1 100644 --- a/src/reader/parser/inside_doctype.rs +++ b/src/reader/parser/inside_doctype.rs @@ -51,12 +51,12 @@ impl PullParser { None }, Token::Character(c) if is_whitespace_char(c) => { - match self.buf.as_str() { + let buf = self.take_buf(); + match buf.as_str() { "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), - s => Some(self.error(SyntaxError::UnknownMarkupDeclaration(s.into()))), + _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))), } - }, _ => Some(self.error(SyntaxError::UnexpectedToken(t))), }, diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs index 8ab13646..4eaad0a9 100644 --- a/src/reader/parser/outside_tag.rs +++ b/src/reader/parser/outside_tag.rs @@ -93,6 +93,7 @@ impl PullParser { if self.inside_whitespace && self.config.c.trim_whitespace { None } else if self.inside_whitespace && !self.config.c.whitespace_to_characters { + debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}"); Some(Ok(XmlEvent::Whitespace(buf))) } else if self.config.c.trim_whitespace { Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) From f8f8a3a67ee6e703b2925ed4389ebddbc3d9a35c Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 23 Sep 2023 01:02:25 +0100 Subject: [PATCH 2/5] Namespace.borrow for consistency --- src/namespace.rs | 6 ++++++ src/reader/events.rs | 4 +--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/namespace.rs b/src/namespace.rs index 216a982d..96976c31 100644 --- a/src/namespace.rs +++ b/src/namespace.rs @@ -1,5 +1,6 @@ //! Contains namespace manipulation types and functions. +use std::borrow::Cow; use std::collections::btree_map::Iter as Entries; use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; @@ -165,6 +166,11 @@ impl Namespace { pub fn get<'a, P: ?Sized + AsRef>(&'a self, prefix: &P) -> Option<&'a str> { self.0.get(prefix.as_ref()).map(|s| &**s) } + + /// Borrowed namespace for the writer + pub fn borrow(&self) -> Cow<'_, Self> { + Cow::Borrowed(self) + } } /// An alias for iterator type for namespace mappings contained in a namespace. diff --git a/src/reader/events.rs b/src/reader/events.rs index de2b930e..e8eb81e3 100644 --- a/src/reader/events.rs +++ b/src/reader/events.rs @@ -1,8 +1,6 @@ //! Contains `XmlEvent` datatype, instances of which are emitted by the parser. -use std::borrow::Cow; use std::fmt; - use crate::attribute::OwnedAttribute; use crate::common::XmlVersion; use crate::name::OwnedName; @@ -207,7 +205,7 @@ impl XmlEvent { Some(crate::writer::events::XmlEvent::StartElement { name: name.borrow(), attributes: attributes.iter().map(|a| a.borrow()).collect(), - namespace: Cow::Borrowed(namespace) + namespace: namespace.borrow(), }), XmlEvent::EndElement { ref name } => Some(crate::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), From 1c2dc5bad03cbb9d1d16d8cf208886acdebb9afa Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 23 Sep 2023 01:05:30 +0100 Subject: [PATCH 3/5] Document rewriting example --- examples/rewrite.rs | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 examples/rewrite.rs diff --git a/examples/rewrite.rs b/examples/rewrite.rs new file mode 100644 index 00000000..8e4b967b --- /dev/null +++ b/examples/rewrite.rs @@ -0,0 +1,68 @@ +//! See for a real-world example. + +use xml::EmitterConfig; +use std::fs::File; +use std::io::BufReader; +use std::path::Path; +use xml::reader::{ParserConfig, Result}; + +fn main() -> Result<(), Box> { + let arg = std::env::args_os().nth(1); + let file_path = Path::new(arg.as_deref().unwrap_or("tests/documents/sample_1.xml".as_ref())); + let file = BufReader::new(File::open(file_path) + .map_err(|e| format!("Can't open {}: {e}", file_path.display()))?); + + let mut reader = ParserConfig::default() + .ignore_root_level_whitespace(true) + .ignore_comments(false) + .cdata_to_characters(true) + .coalesce_characters(true) + .create_reader(file); + + let stdout = std::io::stdout().lock(); + + let mut writer = EmitterConfig::default() + .create_writer(stdout); + + loop { + let reader_event = reader.next()?; + + match reader_event { + xml::reader::XmlEvent::EndDocument => break, + xml::reader::XmlEvent::StartElement { name, mut attributes, namespace } => { + let event = xml::writer::XmlEvent::StartElement { + name: name.borrow(), + namespace: namespace.borrow(), + attributes: attributes.iter_mut().map(|attr| { + attr.value = alternating_caps(&attr.value); + attr.borrow() + }).collect(), + }; + writer.write(event)?; + }, + xml::reader::XmlEvent::Characters(text) => { + let text = alternating_caps(&text); + let event = xml::writer::XmlEvent::Characters(&text); + writer.write(event)?; + }, + xml::reader::XmlEvent::Comment(text) => { + let text = alternating_caps(&text); + let event = xml::writer::XmlEvent::Comment(&text); + writer.write(event)?; + }, + other => { + if let Some(writer_event) = other.as_writer_event() { + writer.write(writer_event)?; + } + } + } + + } + Ok(()) +} + +fn alternating_caps(text: &str) -> String { + text.chars().enumerate() + .map(|(i, ch)| if i&1==0 { ch.to_ascii_uppercase() } else { ch.to_ascii_lowercase() }) + .collect() +} From ab794a72863feb64af8422d99ac763895f51f69a Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 23 Sep 2023 01:13:59 +0100 Subject: [PATCH 4/5] Clippy + fmt --- examples/print_events.rs | 2 +- src/attribute.rs | 2 +- src/common.rs | 6 +++--- src/escape.rs | 17 +++++++++-------- src/macros.rs | 4 ++-- src/namespace.rs | 1 + src/reader/indexset.rs | 8 ++++---- src/reader/parser.rs | 13 ++++++------- src/reader/parser/inside_doctype.rs | 4 ++-- src/reader/parser/inside_opening_tag.rs | 10 ++++++---- src/reader/parser/outside_tag.rs | 2 +- src/writer/emitter.rs | 3 +-- tests/event_writer.rs | 2 +- tests/streaming.rs | 2 +- 14 files changed, 39 insertions(+), 37 deletions(-) diff --git a/examples/print_events.rs b/examples/print_events.rs index eaef7d81..79dca162 100644 --- a/examples/print_events.rs +++ b/examples/print_events.rs @@ -1,7 +1,7 @@ use std::fs::File; use std::io::BufReader; use xml::common::Position; -use xml::reader::*; +use xml::reader::{ParserConfig, XmlEvent}; fn main() { let file_path = std::env::args_os().nth(1).expect("Please specify a path to an XML file"); diff --git a/src/attribute.rs b/src/attribute.rs index 112bf247..5d0184e3 100644 --- a/src/attribute.rs +++ b/src/attribute.rs @@ -3,7 +3,7 @@ use std::fmt; -use crate::escape::{Escaped, AttributeEscapes}; +use crate::escape::{AttributeEscapes, Escaped}; use crate::name::{Name, OwnedName}; /// A borrowed version of an XML attribute. diff --git a/src/common.rs b/src/common.rs index a1bf3ac1..0b324f26 100644 --- a/src/common.rs +++ b/src/common.rs @@ -112,15 +112,15 @@ pub fn is_whitespace_str(s: &str) -> bool { s.chars().all(is_whitespace_char) } -pub fn is_xml10_char(c: char) -> bool { +#[must_use] pub fn is_xml10_char(c: char) -> bool { matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } -pub fn is_xml11_char(c: char) -> bool { +#[must_use] pub fn is_xml11_char(c: char) -> bool { matches!(c, '\u{01}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..) } -pub fn is_xml11_char_not_restricted(c: char) -> bool { +#[must_use] pub fn is_xml11_char_not_restricted(c: char) -> bool { is_xml11_char(c) && !matches!(c, '\u{01}'..='\u{08}' | '\u{0B}'..='\u{0C}' | '\u{0E}'..='\u{1F}' | '\u{7F}'..='\u{84}' | '\u{86}'..='\u{9F}') } diff --git a/src/escape.rs b/src/escape.rs index 1fcfd06f..ad8ee4a9 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -5,11 +5,11 @@ use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}}; pub(crate) trait Escapes { fn escape(c: u8) -> Option<&'static str>; - fn byte_needs_escaping(c: u8) -> bool{ + fn byte_needs_escaping(c: u8) -> bool { Self::escape(c).is_some() } - fn str_needs_escaping(s: &str) -> bool{ + fn str_needs_escaping(s: &str) -> bool { s.bytes().any(|c| Self::escape(c).is_some()) } } @@ -22,13 +22,12 @@ pub(crate) struct Escaped<'a, E: Escapes> { impl<'a, E: Escapes> Escaped<'a, E> { pub fn new(s: &'a str) -> Self { Escaped { - _escape_phantom: PhantomData, + _escape_phantom: PhantomData, to_escape: s, } } } - impl<'a, E: Escapes> Display for Escaped<'a, E> { fn fmt(&self, f: &mut Formatter<'_>) -> Result { let mut total_remaining = self.to_escape; @@ -49,7 +48,7 @@ impl<'a, E: Escapes> Display for Escaped<'a, E> { total_remaining = &remaining[1..]; } - + f.write_str(total_remaining) } } @@ -107,7 +106,7 @@ escapes!( /// * `"` → `"` /// * `'` → `'` /// * `&` → `&` -/// +/// /// The following characters are escaped so that attributes are printed on /// a single line: /// * `\n` → ` ` @@ -117,7 +116,8 @@ escapes!( /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -#[must_use] pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { +#[must_use] +pub fn escape_str_attribute(s: &str) -> Cow<'_, str> { escape_str::(s) } @@ -133,7 +133,8 @@ escapes!( /// /// Does not perform allocations if the given string does not contain escapable characters. #[inline] -#[must_use] pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { +#[must_use] +pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> { escape_str::(s) } diff --git a/src/macros.rs b/src/macros.rs index eab291f6..da1adade 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -19,7 +19,7 @@ macro_rules! gen_setter { /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] - pub fn $field(mut self, value: $t) -> Self { + #[must_use] pub fn $field(mut self, value: $t) -> Self { self.$field = value; self } @@ -29,7 +29,7 @@ macro_rules! gen_setter { /// /// See [`ParserConfig`][crate::ParserConfig] fields docs for details #[inline] - pub fn $field(mut self, value: $t) -> Self { + #[must_use] pub fn $field(mut self, value: $t) -> Self { self.c.$field = value; self } diff --git a/src/namespace.rs b/src/namespace.rs index 96976c31..02f3b3df 100644 --- a/src/namespace.rs +++ b/src/namespace.rs @@ -168,6 +168,7 @@ impl Namespace { } /// Borrowed namespace for the writer + #[must_use] pub fn borrow(&self) -> Cow<'_, Self> { Cow::Borrowed(self) } diff --git a/src/reader/indexset.rs b/src/reader/indexset.rs index 4ce633f8..3d683a28 100644 --- a/src/reader/indexset.rs +++ b/src/reader/indexset.rs @@ -84,13 +84,13 @@ fn indexset() { } assert!(s.contains(&OwnedName { - local_name: format!("attr1234"), namespace: None, prefix: None, + local_name: "attr1234".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { - local_name: format!("attr0"), namespace: None, prefix: None, + local_name: "attr0".into(), namespace: None, prefix: None, })); assert!(s.contains(&OwnedName { - local_name: format!("attr49999"), namespace: None, prefix: None, + local_name: "attr49999".into(), namespace: None, prefix: None, })); } @@ -100,7 +100,7 @@ struct U64Hasher(u64); impl Hasher for U64Hasher { fn finish(&self) -> u64 { self.0 } fn write(&mut self, slice: &[u8]) { - for &v in slice { self.0 ^= v as u64 } // unused in practice + for &v in slice { self.0 ^= u64::from(v) } // unused in practice } fn write_u64(&mut self, i: u64) { self.0 ^= i; diff --git a/src/reader/parser.rs b/src/reader/parser.rs index ca831e84..18f073d7 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -396,7 +396,7 @@ impl PullParser { fn next_pos(&mut self) { // unfortunately calls to next_pos will never be perfectly balanced with push_pos, // at very least because parse errors and EOF can happen unexpectedly without a prior push. - if self.pos.len() > 0 { + if !self.pos.is_empty() { if self.pos.len() > 1 { self.pos.remove(0); } else { @@ -485,7 +485,7 @@ impl PullParser { let name = this.take_buf(); match name.parse() { Ok(name) => on_name(this, t, name), - Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))) + Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))), } }; @@ -515,7 +515,7 @@ impl PullParser { Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), - _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))) + _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))), } } @@ -527,7 +527,7 @@ impl PullParser { fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option where F: Fn(&mut PullParser, String) -> Option { match t { - Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace + Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace Token::DoubleQuote | Token::SingleQuote => match self.data.quote { None => { // Entered attribute value @@ -558,8 +558,7 @@ impl PullParser { self.into_state_continue(State::InsideReference) }, - Token::OpeningTagStart => - Some(self.error(SyntaxError::UnexpectedOpeningTag)), + Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)), Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) @@ -584,7 +583,7 @@ impl PullParser { // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { - Some("") => name.namespace = None, // default namespace + Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs index abf897c1..87595d65 100644 --- a/src/reader/parser/inside_doctype.rs +++ b/src/reader/parser/inside_doctype.rs @@ -31,8 +31,8 @@ impl PullParser { _ => None, }, DoctypeSubstate::String => match t { - Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None }, - Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None }, + Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None, + Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None, Token::SingleQuote | Token::DoubleQuote => { self.data.quote = None; self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs index 0ebc58d1..fb6d0017 100644 --- a/src/reader/parser/inside_opening_tag.rs +++ b/src/reader/parser/inside_opening_tag.rs @@ -31,7 +31,7 @@ impl PullParser { OpeningTagSubstate::InsideTag => match t { Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), - Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace + Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace Token::Character(c) if is_name_start_char(c) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); @@ -39,7 +39,7 @@ impl PullParser { self.buf.push(c); self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) } - _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { @@ -108,10 +108,12 @@ impl PullParser { }), OpeningTagSubstate::AfterAttributeValue => match t { - Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + Token::Character(c) if is_whitespace_char(c) => { + self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + }, Token::TagEnd => self.emit_start_element(false), Token::EmptyTagEnd => self.emit_start_element(true), - _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) + _ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))), }, } } diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs index 4eaad0a9..e62f8620 100644 --- a/src/reader/parser/outside_tag.rs +++ b/src/reader/parser/outside_tag.rs @@ -175,7 +175,7 @@ impl PullParser { self.into_state(State::OutsideTag, next_event) }, - Token::CommentStart => { + Token::CommentStart => { let next_event = self.set_encountered(Encountered::Comment); self.into_state(State::InsideComment, next_event) } diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs index 8e74b5f2..431d48c0 100644 --- a/src/writer/emitter.rs +++ b/src/writer/emitter.rs @@ -390,8 +390,7 @@ impl Emitter { } } - pub fn emit_characters(&mut self, target: &mut W, - content: &str) -> Result<()> { + pub fn emit_characters(&mut self, target: &mut W, content: &str) -> Result<()> { self.check_document_started(target)?; self.fix_non_empty_element(target)?; diff --git a/tests/event_writer.rs b/tests/event_writer.rs index 2ca4b8a0..90ec1c13 100644 --- a/tests/event_writer.rs +++ b/tests/event_writer.rs @@ -31,7 +31,7 @@ fn reading_writing_equal_with_namespaces() { Err(e) => panic!("Writer error: {e:?}") } }, - Err(e) => panic!("Error: {e}") + Err(e) => panic!("Error: {e}"), } } } diff --git a/tests/streaming.rs b/tests/streaming.rs index b53c0315..6a21afb8 100644 --- a/tests/streaming.rs +++ b/tests/streaming.rs @@ -21,7 +21,7 @@ macro_rules! assert_match { (left: `{:?}`, right: `{} if {}`", $actual, stringify!($expected), stringify!($guard)) } - } + }; } fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { From bfb185ede18170f7b21f9b17ab65cbb4aba2de22 Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 23 Sep 2023 01:15:33 +0100 Subject: [PATCH 5/5] Bump --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bacd3ad0..264cf008 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "xml-rs" -version = "0.8.18" +version = "0.8.19" authors = ["Vladimir Matveev "] license = "MIT" description = "An XML library in pure Rust"