Skip to content

Commit 535914e

Browse files
committed
Forbid whitespace before XML prolog
1 parent 9ca52b9 commit 535914e

File tree

6 files changed

+61
-12
lines changed

6 files changed

+61
-12
lines changed

src/reader/parser.rs

+9-6
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,11 @@ pub(crate) struct PullParser {
9090
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
9191
enum Encountered {
9292
None = 0,
93-
Declaration = 1,
94-
Comment = 2,
95-
Doctype = 3,
96-
Element = 4,
93+
AnyChars, // whitespace before <?xml is not allowed
94+
Declaration,
95+
Comment,
96+
Doctype,
97+
Element,
9798
}
9899

99100
impl PullParser {
@@ -117,7 +118,7 @@ impl PullParser {
117118
PullParser {
118119
config,
119120
lexer,
120-
st: State::OutsideTag,
121+
st: State::DocumentStart,
121122
state_after_reference: State::OutsideTag,
122123
buf: String::new(),
123124
entities: HashMap::new(),
@@ -159,7 +160,7 @@ impl PullParser {
159160

160161
// If declaration was not parsed and we have encountered an element,
161162
// emit this declaration as the next event.
162-
if prev_enc < Encountered::Declaration {
163+
if prev_enc == Encountered::None {
163164
self.push_pos();
164165
Some(Ok(XmlEvent::StartDocument {
165166
version: DEFAULT_VERSION,
@@ -191,6 +192,7 @@ pub enum State {
191192
InsideDeclaration(DeclarationSubstate),
192193
InsideDoctype(DoctypeSubstate),
193194
InsideReference,
195+
DocumentStart,
194196
}
195197

196198
#[derive(Copy, Clone, PartialEq)]
@@ -423,6 +425,7 @@ impl PullParser {
423425
State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
424426
State::InsideDoctype(s) => self.inside_doctype(t, s),
425427
State::InsideDeclaration(s) => self.inside_declaration(t, s),
428+
State::DocumentStart => self.document_start(t),
426429
}
427430
}
428431

src/reader/parser/outside_tag.rs

+51-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ impl PullParser {
128128
Token::ProcessingInstructionStart =>
129129
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
130130

131-
132131
Token::CDataStart if self.depth() > 0 => {
133132
self.into_state(State::InsideCData, next_event)
134133
},
@@ -138,4 +137,55 @@ impl PullParser {
138137
}
139138
}
140139
}
140+
141+
pub fn document_start(&mut self, t: Token) -> Option<Result> {
142+
debug_assert!(self.encountered < Encountered::Declaration);
143+
144+
match t {
145+
Token::Character(c) => {
146+
let next_event = self.set_encountered(Encountered::AnyChars);
147+
148+
if !is_whitespace_char(c) {
149+
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
150+
}
151+
self.inside_whitespace = true;
152+
153+
// skip whitespace outside of the root element
154+
if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
155+
(self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
156+
return self.into_state(State::OutsideTag, next_event);
157+
}
158+
159+
self.push_pos();
160+
self.buf.push(c);
161+
self.into_state(State::OutsideTag, next_event)
162+
},
163+
164+
Token::CommentStart => {
165+
let next_event = self.set_encountered(Encountered::Comment);
166+
self.into_state(State::InsideComment, next_event)
167+
}
168+
169+
Token::OpeningTagStart => {
170+
let next_event = self.set_encountered(Encountered::Element);
171+
self.nst.push_empty();
172+
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
173+
},
174+
175+
Token::DoctypeStart => {
176+
let next_event = self.set_encountered(Encountered::Doctype);
177+
// We don't have a doctype event so skip this position
178+
// FIXME: update when we have a doctype event
179+
self.next_pos();
180+
self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
181+
},
182+
183+
Token::ProcessingInstructionStart => {
184+
self.push_pos();
185+
self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
186+
},
187+
188+
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
189+
}
190+
}
141191
}

tests/event_reader.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ fn tabs_1() {
285285
test(
286286
b"\t<a>\t<b/></a>",
287287
br#"
288-
|1:2 StartDocument(1.0, UTF-8)
288+
|1:1 StartDocument(1.0, UTF-8)
289289
|1:2 StartElement(a)
290290
|1:6 StartElement(b)
291291
|1:6 EndElement(b)

tests/oasis.fail.txt

-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
o-p04pass1 p04pass1.xml names with all valid ASCII characters, and one from each other class in NameChar ; 5:8 Element A.-:̀· prefix is unbound
22
o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0 prefix is unbound
3-
o-p01fail1 p01fail1.xml S cannot occur before the prolog
43
o-p09fail1 p09fail1.xml EntityValue excludes '%'
54
o-p09fail2 p09fail2.xml EntityValue excludes '&'
65
o-p12fail1 p12fail1.xml '"' excluded
@@ -10,7 +9,6 @@ o-p12fail4 p12fail4.xml '>' excluded
109
o-p12fail5 p12fail5.xml '<' excluded
1110
o-p12fail6 p12fail6.xml built-in entity refs excluded
1211
o-p12fail7 p12fail7.xml The public ID has a tab character, which is disallowed
13-
o-p22fail1 p22fail1.xml prolog must start with XML decl
1412
o-p30fail1 p30fail1.xml An XML declaration is not the same as a TextDecl
1513
o-p31fail1 p31fail1.xml external subset excludes doctypedecl
1614
o-p32fail3 p32fail3.xml initial S is required

tests/sun-not-wf.fail.txt

-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ pubid02 pubid02.xml Illegal characters in public ID
2727
pubid03 pubid03.xml Illegal characters in public ID
2828
pubid04 pubid04.xml Illegal characters in public ID
2929
pubid05 pubid05.xml SGML-ism: public ID without system ID
30-
sgml02 sgml02.xml XML declaration must be at the very beginning of a document; it"s not a processing instruction
3130
sgml04 sgml04.xml ATTLIST declarations apply to only one element, unlike SGML
3231
sgml05 sgml05.xml ELEMENT declarations apply to only one element, unlike SGML
3332
sgml06 sgml06.xml ATTLIST declarations are never global, unlike in SGML

tests/xmltest.fail.txt

-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ not-wf-sa-136 136.xml Tag omission is invalid in XML.
5151
not-wf-sa-137 137.xml Space is required before a content model.
5252
not-wf-sa-138 138.xml Invalid syntax for content particle.
5353
not-wf-sa-139 139.xml The element-content model should not be empty.
54-
not-wf-sa-147 147.xml XML Declaration may not be preceded by whitespace.
5554
not-wf-sa-149 149.xml XML Declaration may not be within a DTD.
5655
not-wf-sa-158 158.xml SGML-ism: "#NOTATION gif" can't have attributes.
5756
not-wf-sa-159 159.xml Uses '&' unquoted in an entity declaration, which is illegal syntax for an entity reference.

0 commit comments

Comments
 (0)