Skip to content

Commit 014d808

Browse files
committed
Parse predefined entities
1 parent ae9d83b commit 014d808

File tree

8 files changed

+136
-20
lines changed

8 files changed

+136
-20
lines changed

src/reader/lexer.rs

+27-1
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,9 @@ impl Lexer {
590590
'<' => self.move_to(State::TagStarted),
591591
'&' => Ok(Some(Token::ReferenceStart)),
592592
';' => Ok(Some(Token::ReferenceEnd)),
593-
_ => Ok(None),
593+
'"' => Ok(Some(Token::DoubleQuote)),
594+
'\'' => Ok(Some(Token::SingleQuote)),
595+
_ => Ok(Some(Token::Character(c))),
594596
}
595597
}
596598

@@ -886,6 +888,14 @@ mod tests {
886888
Token::Character('a')
887889
Token::TagEnd
888890
Token::DoctypeStart
891+
Token::Character(' ')
892+
Token::Character('a')
893+
Token::Character('b')
894+
Token::Character(' ')
895+
Token::Character('x')
896+
Token::Character('x')
897+
Token::Character(' ')
898+
Token::Character('z')
889899
Token::TagEnd
890900
Token::Character(' ')
891901
);
@@ -925,6 +935,10 @@ mod tests {
925935
Token::Character('a')
926936
Token::TagEnd
927937
Token::DoctypeStart
938+
Token::Character(' ')
939+
Token::Character('a')
940+
Token::Character('b')
941+
Token::Character('[')
928942
Token::MarkupDeclarationStart
929943
Token::Character('E')
930944
Token::Character('L')
@@ -943,6 +957,8 @@ mod tests {
943957
Token::Character('>')
944958
Token::DoubleQuote
945959
Token::TagEnd
960+
Token::Character(' ')
961+
Token::Character(']')
946962
Token::TagEnd
947963
Token::Character(' ')
948964
);
@@ -956,6 +972,11 @@ mod tests {
956972
);
957973
assert_oks!(for lex and buf ;
958974
Token::DoctypeStart
975+
Token::Character(' ')
976+
Token::Character('a')
977+
Token::Character(' ')
978+
Token::Character('[')
979+
Token::Character('\n')
959980
Token::MarkupDeclarationStart
960981
Token::Character('E')
961982
Token::Character('L')
@@ -971,6 +992,7 @@ mod tests {
971992
Token::Character('N')
972993
Token::Character('Y')
973994
Token::TagEnd
995+
Token::Character(' ')
974996
Token::CommentStart
975997
Token::Character(' ')
976998
Token::Character('<')
@@ -981,13 +1003,17 @@ mod tests {
9811003
Token::Character('?')
9821004
Token::Character('>')
9831005
Token::CommentEnd
1006+
Token::Character(' ')
9841007
Token::ProcessingInstructionStart
9851008
Token::Character('p')
9861009
Token::Character('i')
9871010
Token::Character(' ')
9881011
Token::TagEnd // not really
9891012
Token::Character(' ')
9901013
Token::ProcessingInstructionEnd
1014+
Token::Character(' ')
1015+
Token::Character('\n')
1016+
Token::Character(']')
9911017
Token::TagEnd // DTD
9921018
);
9931019
assert_none!(for lex and buf);

src/reader/parser.rs

+8
Original file line numberDiff line numberDiff line change
@@ -193,14 +193,22 @@ pub enum State {
193193
/// Sub-state of the parser while it is inside a `<!DOCTYPE ...>` declaration.
///
/// Carried as the payload of `State::InsideDoctype(..)`; the doctype token
/// handler dispatches on it for every incoming token.
#[derive(Copy, Clone, PartialEq)]
194194
pub enum DoctypeSubstate {
195195
/// Directly inside the doctype, not within any nested construct. From here
/// a markup declaration, a `%` parameter-entity reference, a comment or a
/// quoted literal can begin; the closing tag end leaves the doctype.
Outside,
196+
/// Inside a quoted string literal that appears directly in the doctype.
/// Its content is discarded; the state ends at the matching quote.
String,
196197
/// Right after a markup declaration start; uppercase ASCII letters are
/// being buffered.
// NOTE(review): presumably this is the declaration keyword (e.g. ENTITY) — confirm in the handler.
InsideName,
197198
// NOTE(review): presumably skipping whitespace before an entity name — handler header not visible here, confirm.
BeforeEntityName,
198199
// NOTE(review): presumably accumulating the characters of an entity name — confirm.
EntityName,
199200
/// Reached after a parameter-entity name is terminated by whitespace.
// NOTE(review): presumably waits for the opening quote of the value — confirm.
BeforeEntityValue,
200201
/// Inside a quoted entity value. Parameter-entity expansion and
/// reference handling return to this state once they are done.
EntityValue,
201202
/// Just after `&` inside an entity value: a following `#` starts a numeric
/// reference, any other character is kept literally together with the `&`
/// (named entities are not expanded inside the doctype).
NumericReferenceStart,
202203
/// Inside a numeric character reference; terminated by `;`.
NumericReference,
204+
/// Expansion of a parameter-entity reference (`%name;`) found inside an
/// entity value: the referenced text is appended to the value buffer.
205+
PEReferenceInValue,
206+
/// Expansion of a parameter-entity reference (`%name;`) found directly in
/// the doctype: the referenced text is handed back to the lexer to be
/// reparsed.
PEReferenceInDtd,
207+
/// Definition of a parameter entity: a `%` was seen in the declaration and
/// whitespace is skipped until the first character of the name.
208+
PEReferenceDefinitionStart,
/// Definition of a parameter entity: accumulating the name (stored with a
/// leading `%` to keep it apart from regular entities) until whitespace.
209+
PEReferenceDefinition,
203210
/// Entered after an entity declaration's value has been stored.
// NOTE(review): presumably ignores tokens up to the end of the declaration — handler not visible here, confirm.
SkipDeclaration,
211+
/// Inside a comment within the doctype; tokens are ignored until the
/// comment end.
Comment,
204212
}
205213

206214
#[derive(Copy, Clone, PartialEq)]

src/reader/parser/inside_doctype.rs

+100-7
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,43 @@ impl PullParser {
88
DoctypeSubstate::Outside => match t {
99
Token::TagEnd => {
1010
self.into_state_continue(State::OutsideTag)
11-
}
12-
11+
},
1312
Token::MarkupDeclarationStart => {
1413
self.buf.clear();
1514
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName))
1615
},
16+
Token::Character('%') => {
17+
self.data.ref_data.clear();
18+
self.data.ref_data.push('%');
19+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd))
20+
},
21+
Token::CommentStart => {
22+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment))
23+
},
24+
Token::SingleQuote | Token::DoubleQuote => {
25+
// just discard string literals
26+
self.data.quote = Some(super::QuoteToken::from_token(&t));
27+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String))
28+
},
29+
Token::CDataEnd | Token::CDataStart => Some(self_error!(self; "Unexpected token {}", t)),
1730
// TODO: parse SYSTEM, and [
1831
_ => None,
1932
},
33+
DoctypeSubstate::String => match t {
34+
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { None },
35+
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { None },
36+
Token::SingleQuote | Token::DoubleQuote => {
37+
self.data.quote = None;
38+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
39+
},
40+
_ => None,
41+
},
42+
DoctypeSubstate::Comment => match t {
43+
Token::CommentEnd => {
44+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
45+
},
46+
_ => None,
47+
},
2048
DoctypeSubstate::InsideName => match t {
2149
Token::Character(c @ 'A'..='Z') => {
2250
self.buf.push(c);
@@ -36,8 +64,10 @@ impl PullParser {
3664
self.data.name.clear();
3765
match t {
3866
Token::Character(c) if is_whitespace_char(c) => None,
39-
// PEDecl unsupported
40-
Token::Character('%') => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
67+
Token::Character('%') => { // % is for PEDecl
68+
self.data.name.push('%');
69+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
70+
},
4171
Token::Character(c) if is_name_start_char(c) => {
4272
self.data.name.push(c);
4373
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
@@ -77,35 +107,98 @@ impl PullParser {
77107
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None },
78108
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None },
79109
Token::SingleQuote | Token::DoubleQuote => {
110+
self.data.quote = None;
80111
let name = self.data.take_name();
81112
let val = self.take_buf();
82-
self.data.quote = None;
83113
self.entities.entry(name).or_insert(val); // First wins
84114
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME
85115
},
86-
Token::Character('&') => {
116+
Token::ReferenceStart | Token::Character('&') => {
87117
self.data.ref_data.clear();
88118
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart))
89119
},
120+
Token::Character('%') => {
121+
self.data.ref_data.clear();
122+
self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities
123+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue))
124+
},
90125
Token::Character(c) => {
91126
self.buf.push(c);
92127
None
93128
},
94129
_ => Some(self_error!(self; "Expected entity value, found {}", t)),
95130
},
131+
DoctypeSubstate::PEReferenceDefinitionStart => match t {
132+
Token::Character(c) if is_whitespace_char(c) => {
133+
None
134+
},
135+
Token::Character(c) if is_name_start_char(c) => {
136+
debug_assert_eq!(self.data.name, "%");
137+
self.data.name.push(c);
138+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition))
139+
},
140+
_ => Some(self_error!(self; "Unexpected {} in entity", t)),
141+
},
142+
DoctypeSubstate::PEReferenceDefinition => match t {
143+
Token::Character(c) if is_name_char(c) => {
144+
self.data.name.push(c);
145+
None
146+
},
147+
Token::Character(c) if is_whitespace_char(c) => {
148+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
149+
},
150+
_ => Some(self_error!(self; "Unexpected {} in entity", t)),
151+
},
152+
DoctypeSubstate::PEReferenceInDtd => match t {
153+
Token::Character(c) if is_name_char(c) => {
154+
self.data.ref_data.push(c);
155+
None
156+
},
157+
Token::ReferenceEnd | Token::Character(';') => {
158+
let name = self.data.take_ref_data();
159+
match self.entities.get(&name) {
160+
Some(ent) => {
161+
if let Err(e) = self.lexer.reparse(ent) {
162+
return Some(Err(e));
163+
}
164+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
165+
},
166+
None => Some(self_error!(self; "Undefined PE entity {}", name)),
167+
}
168+
},
169+
_ => Some(self_error!(self; "Unexpected {} in entity", t)),
170+
},
171+
DoctypeSubstate::PEReferenceInValue => match t {
172+
Token::Character(c) if is_name_char(c) => {
173+
self.data.ref_data.push(c);
174+
None
175+
},
176+
Token::ReferenceEnd | Token::Character(';') => {
177+
let name = self.data.take_ref_data();
178+
match self.entities.get(&name) {
179+
Some(ent) => {
180+
self.buf.push_str(ent);
181+
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
182+
},
183+
None => Some(self_error!(self; "Undefined PE entity {}", name)),
184+
}
185+
},
186+
_ => Some(self_error!(self; "Unexpected {} in entity", t)),
187+
},
96188
DoctypeSubstate::NumericReferenceStart => match t {
97189
Token::Character('#') => {
98190
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference))
99191
},
100192
Token::Character(c) => {
101193
self.buf.push('&');
102194
self.buf.push(c);
195+
// named entities are not expanded inside doctype
103196
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
104197
},
105198
_ => Some(self_error!(self; "Unexpected {} in entity", t)),
106199
},
107200
DoctypeSubstate::NumericReference => match t {
108-
Token::Character(';') => {
201+
Token::ReferenceEnd | Token::Character(';') => {
109202
let r = self.data.take_ref_data();
110203
// https://www.w3.org/TR/xml/#sec-entexpand
111204
match self.numeric_reference_from_str(&r) {

tests/errata2e.fail.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
rmt-e2e-18 E18.xml External entity containing start of entity declaration is base URI for system identifier ; 7:10 Unexpected entity: ent
1+
rmt-e2e-18 E18.xml External entity containing start of entity declaration is base URI for system identifier ; 5:7 Undefined PE entity %intpe
22
rmt-e2e-19 E19.xml Parameter entities and character references are included-in-literal, but general entities are bypassed. ; 2:10 Unexpected entity: ent
33
rmt-e2e-34 E34.xml A non-deterministic content model is an error even if the element type is not used.
44
rmt-e2e-38 E38.xml XML 1.0 document refers to 1.1 entity

tests/oasis.fail.txt

-7
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0
33
o-p01fail1 p01fail1.xml S cannot occur before the prolog
44
o-p09fail1 p09fail1.xml EntityValue excludes '%'
55
o-p09fail2 p09fail2.xml EntityValue excludes '&'
6-
o-p09fail3 p09fail3.xml incomplete character reference
76
o-p12fail1 p12fail1.xml '"' excluded
87
o-p12fail2 p12fail2.xml '\' excluded
98
o-p12fail3 p12fail3.xml entity references excluded
@@ -77,16 +76,10 @@ o-p63fail2 p63fail2.xml delimiters must be balanced
7776
o-p64fail1 p64fail1.xml section delimiters must balance
7877
o-p64fail2 p64fail2.xml section delimiters must balance
7978
o-p66fail5 p66fail5.xml no references to non-characters
80-
o-p69fail1 p69fail1.xml terminating ';' is required
81-
o-p69fail2 p69fail2.xml no S after '%'
82-
o-p69fail3 p69fail3.xml no S before ';'
8379
o-p72fail2 p72fail2.xml S is required after '%'
84-
o-p72fail3 p72fail3.xml S is required after name
85-
o-p72fail4 p72fail4.xml Entity name is a name, not an NMToken
8680
o-p73fail2 p73fail2.xml Only one replacement value
8781
o-p73fail3 p73fail3.xml No NDataDecl on replacement text
8882
o-p74fail1 p74fail1.xml no NDataDecls on parameter entities
89-
o-p74fail2 p74fail2.xml value is required
9083
o-p74fail3 p74fail3.xml only one value
9184
o-p75fail1 p75fail1.xml S required after "PUBLIC"
9285
o-p75fail2 p75fail2.xml S required after "SYSTEM"

tests/sun-not-wf.fail.txt

-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ content03 content03.xml No whitespace before "+" in content model
1717
decl01 decl01.xml External entities may not have standalone decls.
1818
nwf-dtd00 dtd00.xml Comma mandatory in content model
1919
nwf-dtd01 dtd01.xml Can't mix comma and vertical bar in content models
20-
dtd02 dtd02.xml PE name immediately after "%"
21-
dtd03 dtd03.xml PE name immediately followed by ";"
2220
dtd04 dtd04.xml PUBLIC literal must be quoted
2321
dtd05 dtd05.xml SYSTEM identifier must be quoted
2422
dtd07 dtd07.xml Text declarations (which optionally begin any external entity) are required to have "encoding=...".

tests/sun-valid.fail.txt

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
not-sa03 not-sa03.xml A non-standalone document is valid if declared as such.; 19:20 Unexpected entity: internal
22
v-pe00 pe00.xml Tests construction of internal entity replacement text, using an example in the XML specification. ; 2:12 Unexpected entity: book
3-
v-pe02 pe02.xml Tests construction of internal entity replacement text, using a complex example in the XML specification. ; 8:34 Unexpected entity: tricky

tests/xmltest.fail.txt

-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ not-wf-sa-089 089.xml Parameter entities "are" always parsed; NDATA annotation
2929
not-wf-sa-091 091.xml Parameter entities "are" always parsed; NDATA annotations are not permitted.
3030
not-wf-sa-096 096.xml Space is required before the standalone declaration.
3131
not-wf-sa-104 104.xml Internal general parsed entities are only well formed if they match the "content" production.
32-
not-wf-sa-113 113.xml Parameter entity values must use valid reference syntax; this reference is malformed.
3332
not-wf-sa-115 115.xml The replacement text of this entity is an illegal character reference, which must be rejected when it is parsed in the context of an attribute value.
3433
not-wf-sa-116 116.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one.
3534
not-wf-sa-117 117.xml Internal general parsed entities are only well formed if they match the "content" production. This is a partial character reference, not a full one.

0 commit comments

Comments
 (0)