Skip to content

Commit 0f084d4

Browse files
committed
Parse DOCTYPE markup declarations
1 parent 7eedb49 commit 0f084d4

File tree

8 files changed

+132
-49
lines changed

8 files changed

+132
-49
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ clean manner.
2525

2626
This parser is mostly full-featured, however, there are limitations:
2727
* Only UTF-8 is supported;
28-
* DTD validation is not supported, `<!DOCTYPE>` declarations are completely ignored; thus no
28+
* There is only very rudimentary parsing of `<!DOCTYPE>` declarations; thus no
2929
support for custom entities either; internal DTD declarations are likely to cause parsing errors;
30+
* DTD validation is not supported;
3031
* attribute value normalization is not performed, and end-of-line characters are not normalized either.
3132

3233
Other than that the parser tries to be mostly XML-1.1-compliant.

src/reader/lexer.rs

+102-26
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ pub(crate) enum Token {
5454
ReferenceStart,
5555
/// `;`
5656
ReferenceEnd,
57+
/// `<!` that opens a markup declaration inside a DOCTYPE, e.g. `<!ENTITY` or `<!ELEMENT`
58+
MarkupDeclarationStart,
5759
}
5860

5961
impl fmt::Display for Token {
@@ -143,6 +145,7 @@ impl Token {
143145
}
144146
}
145147

148+
#[derive(Copy, Clone)]
146149
enum State {
147150
/// Default state
148151
Normal,
@@ -154,8 +157,10 @@ enum State {
154157
CommentStarted,
155158
/// Triggered on '<!D' up to '<!DOCTYPE'
156159
DoctypeStarted(DoctypeStartedSubstate),
160+
/// Other items like `<!ELEMENT` in DTD
161+
InsideMarkupDeclaration,
157162
/// Triggered after DoctypeStarted to handle sub elements
158-
DoctypeFinishing(u8),
163+
InsideDoctype,
159164
/// Triggered on '<![' up to '<![CDATA'
160165
CDataStarted(CDataStartedSubstate),
161166
/// Triggered on '?'
@@ -174,6 +179,13 @@ enum State {
174179
InsideCdata,
175180
/// After `<?`
176181
InsideProcessingInstruction,
182+
/// Inside a quoted string of a markup declaration: `<!ENTITY "here">`
183+
InsideMarkupDeclarationQuotedString(QuoteStyle),
184+
}
185+
186+
#[derive(Copy, Clone, Eq, PartialEq)]
187+
enum QuoteStyle {
188+
Single, Double
177189
}
178190

179191
#[derive(Copy, Clone)]
@@ -229,6 +241,8 @@ pub(crate) struct Lexer {
229241
head_pos: TextPosition,
230242
char_queue: VecDeque<char>,
231243
st: State,
244+
/// Default state to go back to after a tag end (may be `InsideDoctype`)
245+
normal_state: State,
232246
skip_errors: bool,
233247
inside_token: bool,
234248
eof_handled: bool
@@ -248,21 +262,16 @@ impl Lexer {
248262
head_pos: TextPosition::new(),
249263
char_queue: VecDeque::with_capacity(4), // TODO: check size
250264
st: State::Normal,
265+
normal_state: State::Normal,
251266
skip_errors: false,
252267
inside_token: false,
253268
eof_handled: false
254269
}
255270
}
256271

257-
/// Enables error handling so `next_token` will return `Some(Err(..))`
258-
/// upon invalid lexeme.
259-
#[inline]
260-
pub fn enable_errors(&mut self) { self.skip_errors = false; }
261-
262272
/// Disables error handling so `next_token` will return `Some(Chunk(..))`
263273
/// upon invalid lexeme with this lexeme content.
264-
#[inline]
265-
pub fn disable_errors(&mut self) { self.skip_errors = true; }
274+
pub(crate) fn disable_errors(&mut self) { self.skip_errors = true; }
266275

267276
/// Reset the eof handled flag of the lexer.
268277
#[inline]
@@ -326,9 +335,9 @@ impl Lexer {
326335
State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
327336
State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
328337
State::CommentClosing(ClosingSubstate::Second) |
329-
State::InsideComment |
338+
State::InsideComment | State::InsideMarkupDeclaration |
330339
State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
331-
State::DoctypeFinishing(_) =>
340+
State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
332341
Err(self.error("Unexpected end of stream")),
333342
State::EmptyTagClosing =>
334343
Ok(Some(Token::Character('/'))),
@@ -369,7 +378,7 @@ impl Lexer {
369378
State::CommentStarted => self.comment_started(c),
370379
State::CDataStarted(s) => self.cdata_started(c, s),
371380
State::DoctypeStarted(s) => self.doctype_started(c, s),
372-
State::DoctypeFinishing(d) => self.doctype_finishing(c, d),
381+
State::InsideDoctype => self.inside_doctype(c),
373382
State::EmptyTagClosing => self.empty_element_closing(c),
374383
State::CommentClosing(s) => self.comment_closing(c, s),
375384
State::CDataClosing(s) => self.cdata_closing(c, s),
@@ -378,6 +387,8 @@ impl Lexer {
378387
State::InsideCdata => self.inside_cdata(c),
379388
State::InsideProcessingInstruction => self.inside_processing_instruction(c),
380389
State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
390+
State::InsideMarkupDeclaration => self.markup_declaration(c),
391+
State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q),
381392
}
382393
}
383394

@@ -393,6 +404,13 @@ impl Lexer {
393404
Ok(Some(token))
394405
}
395406

407+
#[inline]
408+
fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result {
409+
self.normal_state = st;
410+
self.st = st;
411+
Ok(Some(token))
412+
}
413+
396414
#[inline]
397415
fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
398416
self.char_queue.extend(cs.iter().copied());
@@ -434,6 +452,7 @@ impl Lexer {
434452
}
435453

436454
fn inside_processing_instruction(&mut self, c: char) -> Result {
455+
// These tokens are used by `<?xml?>` parser
437456
match c {
438457
'?' => self.move_to(State::ProcessingInstructionClosing),
439458
'<' => Ok(Some(Token::OpeningTagStart)),
@@ -461,10 +480,10 @@ impl Lexer {
461480
fn tag_opened(&mut self, c: char) -> Result {
462481
match c {
463482
'?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart),
464-
'/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
483+
'/' => self.move_to_with(self.normal_state, Token::ClosingTagStart),
465484
'!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
466-
_ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
467-
_ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
485+
_ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
486+
_ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
468487
_ => self.handle_error("<", c)
469488
}
470489
}
@@ -475,6 +494,7 @@ impl Lexer {
475494
'-' => self.move_to(State::CommentStarted),
476495
'[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
477496
'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
497+
'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => self.move_to_with(State::InsideMarkupDeclaration, Token::MarkupDeclarationStart),
478498
_ => self.handle_error("<!", c),
479499
}
480500
}
@@ -500,6 +520,27 @@ impl Lexer {
500520
)
501521
}
502522

523+
/// Encountered '<!…' inside a DOCTYPE that isn't a comment, DOCTYPE or CDATA (e.g. `<!ELEMENT`)
524+
fn markup_declaration(&mut self, c: char) -> Result {
525+
match c {
526+
'<' => self.handle_error("<!", c),
527+
'>' => self.move_to_with(self.normal_state, Token::TagEnd),
528+
'&' => Ok(Some(Token::ReferenceStart)),
529+
';' => Ok(Some(Token::ReferenceEnd)),
530+
'"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote),
531+
'\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote),
532+
_ => Ok(None),
533+
}
534+
}
535+
536+
fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result {
537+
match c {
538+
'"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
539+
'\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
540+
_ => Ok(None),
541+
}
542+
}
543+
503544
/// Encountered '<!D'
504545
fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
505546
use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
@@ -509,33 +550,34 @@ impl Lexer {
509550
DOC ; 'T' ; DOCT ; "<!DOC",
510551
DOCT ; 'Y' ; DOCTY ; "<!DOCT",
511552
DOCTY ; 'P' ; DOCTYP ; "<!DOCTY";
512-
DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
553+
DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)
513554
)
514555
}
515556

516557
/// State used while awaiting the closing bracket for the <!DOCTYPE tag
517-
fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
558+
fn inside_doctype(&mut self, c: char) -> Result {
518559
match c {
519-
'<' => self.move_to(State::DoctypeFinishing(d + 1)),
520-
'>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
521-
'>' => self.move_to(State::DoctypeFinishing(d - 1)),
560+
'>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd),
561+
'<' => self.move_to(State::TagStarted),
562+
'&' => Ok(Some(Token::ReferenceStart)),
563+
';' => Ok(Some(Token::ReferenceEnd)),
522564
_ => Ok(None),
523565
}
524566
}
525567

526568
/// Encountered '?'
527569
fn processing_instruction_closing(&mut self, c: char) -> Result {
528570
match c {
529-
'>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
571+
'>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
530572
_ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
531573
}
532574
}
533575

534576
/// Encountered '/'
535577
fn empty_element_closing(&mut self, c: char) -> Result {
536578
match c {
537-
'>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
538-
_ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
579+
'>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
580+
_ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
539581
}
540582
}
541583

@@ -547,7 +589,7 @@ impl Lexer {
547589
_ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')),
548590
},
549591
ClosingSubstate::Second => match c {
550-
'>' => self.move_to_with(State::Normal, Token::CommentEnd),
592+
'>' => self.move_to_with(self.normal_state, Token::CommentEnd),
551593
// double dash not followed by a greater-than is a hard error inside comment
552594
_ => self.handle_error("--", c),
553595
},
@@ -576,7 +618,7 @@ impl Lexer {
576618
_ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')),
577619
},
578620
ClosingSubstate::Second => match c {
579-
'>' => self.move_to_with(State::Normal, Token::CDataEnd),
621+
'>' => self.move_to_with(self.normal_state, Token::CDataEnd),
580622
_ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')),
581623
},
582624
}
@@ -825,19 +867,54 @@ mod tests {
825867
#[test]
826868
fn doctype_with_internal_subset_test() {
827869
let (mut lex, mut buf) = make_lex_and_buf(
828-
r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
870+
r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>>>"> ]> "#
829871
);
830872
assert_oks!(for lex and buf ;
831873
Token::OpeningTagStart
832874
Token::Character('a')
833875
Token::TagEnd
834876
Token::DoctypeStart
877+
Token::MarkupDeclarationStart
878+
Token::DoubleQuote
879+
Token::DoubleQuote
880+
Token::TagEnd
835881
Token::TagEnd
836882
Token::Whitespace(' ')
837883
);
838884
assert_none!(for lex and buf);
839885
}
840886

887+
#[test]
888+
fn doctype_internal_pi_comment() {
889+
let (mut lex, mut buf) = make_lex_and_buf(
890+
"<!DOCTYPE a [\n<!ELEMENT leopard ANY> <!-- <?non?>--> <?pi > ?> \n]>"
891+
);
892+
assert_oks!(for lex and buf ;
893+
Token::DoctypeStart
894+
Token::MarkupDeclarationStart
895+
Token::TagEnd
896+
Token::CommentStart
897+
Token::Whitespace(' ')
898+
Token::Character('<')
899+
Token::Character('?')
900+
Token::Character('n')
901+
Token::Character('o')
902+
Token::Character('n')
903+
Token::Character('?')
904+
Token::Character('>')
905+
Token::CommentEnd
906+
Token::ProcessingInstructionStart
907+
Token::Character('p')
908+
Token::Character('i')
909+
Token::Whitespace(' ')
910+
Token::TagEnd // not really
911+
Token::Whitespace(' ')
912+
Token::ProcessingInstructionEnd
913+
Token::TagEnd // DTD
914+
);
915+
assert_none!(for lex and buf);
916+
}
917+
841918
#[test]
842919
fn end_of_stream_handling_ok() {
843920
macro_rules! eof_check(
@@ -872,7 +949,6 @@ mod tests {
872949
eof_check!("<![CDA" ; 0, 6);
873950
eof_check!("<![CDAT" ; 0, 7);
874951
eof_check!("<![CDATA" ; 0, 8);
875-
// eof_check!("--" ; 0, 2);
876952
}
877953

878954
#[test]

src/reader/parser/inside_cdata.rs

-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ impl PullParser {
77
pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
88
match t {
99
Token::CDataEnd => {
10-
self.lexer.enable_errors();
1110
let event = if self.config.cdata_to_characters {
1211
None
1312
} else {

src/reader/parser/inside_doctype.rs

+14-1
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,23 @@ impl PullParser {
66
pub fn inside_doctype(&mut self, t: Token) -> Option<Result> {
77
match t {
88
Token::TagEnd => {
9-
self.lexer.enable_errors();
109
self.into_state_continue(State::OutsideTag)
1110
}
1211

12+
Token::MarkupDeclarationStart => {
13+
self.into_state_continue(State::InsideDoctypeMarkupDeclaration)
14+
},
15+
16+
_ => None,
17+
}
18+
}
19+
20+
pub fn inside_doctype_markup_declaration(&mut self, t: Token) -> Option<Result> {
21+
match t {
22+
Token::TagEnd => {
23+
self.into_state_continue(State::InsideDoctype)
24+
}
25+
1326
_ => None,
1427
}
1528
}

src/reader/parser/inside_processing_instruction.rs

-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ impl PullParser {
6868

6969
ProcessingInstructionSubstate::PIInsideData => match t {
7070
Token::ProcessingInstructionEnd => {
71-
self.lexer.enable_errors();
7271
let name = self.data.take_name();
7372
let data = self.take_buf();
7473
self.into_state_emit(

src/reader/parser/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ pub enum State {
139139
InsideCData,
140140
InsideDeclaration(DeclarationSubstate),
141141
InsideDoctype,
142+
InsideDoctypeMarkupDeclaration,
142143
InsideReference(Box<State>),
143144
}
144145

@@ -337,6 +338,7 @@ impl PullParser {
337338
State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
338339
State::InsideDeclaration(s) => self.inside_declaration(t, s),
339340
State::InsideDoctype => self.inside_doctype(t),
341+
State::InsideDoctypeMarkupDeclaration => self.inside_doctype_markup_declaration(t),
340342
State::InsideOpeningTag(s) => self.inside_opening_tag(t, s),
341343
State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s),
342344
State::InsideComment => self.inside_comment(t),

src/reader/parser/outside_tag.rs

-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ impl PullParser {
8181
// We don't have a doctype event so skip this position
8282
// FIXME: update when we have a doctype event
8383
self.next_pos();
84-
self.lexer.disable_errors();
8584
self.into_state(State::InsideDoctype, next_event)
8685
}
8786

0 commit comments

Comments
 (0)