@@ -54,6 +54,8 @@ pub(crate) enum Token {
54
54
ReferenceStart ,
55
55
/// `;`
56
56
ReferenceEnd ,
57
+ /// `<!` of `ENTITY`
58
+ MarkupDeclarationStart ,
57
59
}
58
60
59
61
impl fmt:: Display for Token {
@@ -143,6 +145,7 @@ impl Token {
143
145
}
144
146
}
145
147
148
+ #[ derive( Copy , Clone ) ]
146
149
enum State {
147
150
/// Default state
148
151
Normal ,
@@ -154,8 +157,10 @@ enum State {
154
157
CommentStarted ,
155
158
/// Triggered on '<!D' up to '<!DOCTYPE'
156
159
DoctypeStarted ( DoctypeStartedSubstate ) ,
160
+ /// Other items like `<!ELEMENT` in DTD
161
+ InsideMarkupDeclaration ,
157
162
/// Triggered after DoctypeStarted to handle sub elements
158
- DoctypeFinishing ( u8 ) ,
163
+ InsideDoctype ,
159
164
/// Triggered on '<![' up to '<![CDATA'
160
165
CDataStarted ( CDataStartedSubstate ) ,
161
166
/// Triggered on '?'
@@ -174,6 +179,13 @@ enum State {
174
179
InsideCdata ,
175
180
/// After `<?`
176
181
InsideProcessingInstruction ,
182
+ /// `<!ENTITY "here">`
183
+ InsideMarkupDeclarationQuotedString ( QuoteStyle ) ,
184
+ }
185
+
186
/// Quote character that opened a quoted string inside a markup declaration.
#[derive(Clone, Copy, PartialEq, Eq)]
enum QuoteStyle {
    /// The string was opened with `'`.
    Single,
    /// The string was opened with `"`.
    Double,
}
178
190
179
191
#[ derive( Copy , Clone ) ]
@@ -229,6 +241,8 @@ pub(crate) struct Lexer {
229
241
head_pos : TextPosition ,
230
242
char_queue : VecDeque < char > ,
231
243
st : State ,
244
+ /// Default state to go back to after a tag end (may be `InsideDoctype`)
245
+ normal_state : State ,
232
246
skip_errors : bool ,
233
247
inside_token : bool ,
234
248
eof_handled : bool
@@ -248,21 +262,16 @@ impl Lexer {
248
262
head_pos : TextPosition :: new ( ) ,
249
263
char_queue : VecDeque :: with_capacity ( 4 ) , // TODO: check size
250
264
st : State :: Normal ,
265
+ normal_state : State :: Normal ,
251
266
skip_errors : false ,
252
267
inside_token : false ,
253
268
eof_handled : false
254
269
}
255
270
}
256
271
257
- /// Enables error handling so `next_token` will return `Some(Err(..))`
258
- /// upon invalid lexeme.
259
- #[ inline]
260
- pub fn enable_errors ( & mut self ) { self . skip_errors = false ; }
261
-
262
272
/// Disables error handling so `next_token` will return `Some(Chunk(..))`
263
273
/// upon invalid lexeme with this lexeme content.
264
- #[ inline]
265
- pub fn disable_errors ( & mut self ) { self . skip_errors = true ; }
274
+ pub ( crate ) fn disable_errors ( & mut self ) { self . skip_errors = true ; }
266
275
267
276
/// Reset the eof handled flag of the lexer.
268
277
#[ inline]
@@ -326,9 +335,9 @@ impl Lexer {
326
335
State :: TagStarted | State :: CommentOrCDataOrDoctypeStarted |
327
336
State :: CommentStarted | State :: CDataStarted ( _) | State :: DoctypeStarted ( _) |
328
337
State :: CommentClosing ( ClosingSubstate :: Second ) |
329
- State :: InsideComment |
338
+ State :: InsideComment | State :: InsideMarkupDeclaration |
330
339
State :: InsideProcessingInstruction | State :: ProcessingInstructionClosing |
331
- State :: DoctypeFinishing ( _) =>
340
+ State :: InsideDoctype | State :: InsideMarkupDeclarationQuotedString ( _) =>
332
341
Err ( self . error ( "Unexpected end of stream" ) ) ,
333
342
State :: EmptyTagClosing =>
334
343
Ok ( Some ( Token :: Character ( '/' ) ) ) ,
@@ -369,7 +378,7 @@ impl Lexer {
369
378
State :: CommentStarted => self . comment_started ( c) ,
370
379
State :: CDataStarted ( s) => self . cdata_started ( c, s) ,
371
380
State :: DoctypeStarted ( s) => self . doctype_started ( c, s) ,
372
- State :: DoctypeFinishing ( d ) => self . doctype_finishing ( c , d ) ,
381
+ State :: InsideDoctype => self . inside_doctype ( c ) ,
373
382
State :: EmptyTagClosing => self . empty_element_closing ( c) ,
374
383
State :: CommentClosing ( s) => self . comment_closing ( c, s) ,
375
384
State :: CDataClosing ( s) => self . cdata_closing ( c, s) ,
@@ -378,6 +387,8 @@ impl Lexer {
378
387
State :: InsideCdata => self . inside_cdata ( c) ,
379
388
State :: InsideProcessingInstruction => self . inside_processing_instruction ( c) ,
380
389
State :: ProcessingInstructionClosing => self . processing_instruction_closing ( c) ,
390
+ State :: InsideMarkupDeclaration => self . markup_declaration ( c) ,
391
+ State :: InsideMarkupDeclarationQuotedString ( q) => self . markup_declaration_string ( c, q) ,
381
392
}
382
393
}
383
394
@@ -393,6 +404,13 @@ impl Lexer {
393
404
Ok ( Some ( token) )
394
405
}
395
406
407
+ #[ inline]
408
+ fn move_to_and_reset_normal ( & mut self , st : State , token : Token ) -> Result {
409
+ self . normal_state = st;
410
+ self . st = st;
411
+ Ok ( Some ( token) )
412
+ }
413
+
396
414
#[ inline]
397
415
fn move_to_with_unread ( & mut self , st : State , cs : & [ char ] , token : Token ) -> Result {
398
416
self . char_queue . extend ( cs. iter ( ) . copied ( ) ) ;
@@ -434,6 +452,7 @@ impl Lexer {
434
452
}
435
453
436
454
fn inside_processing_instruction ( & mut self , c : char ) -> Result {
455
+ // These tokens are used by `<?xml?>` parser
437
456
match c {
438
457
'?' => self . move_to ( State :: ProcessingInstructionClosing ) ,
439
458
'<' => Ok ( Some ( Token :: OpeningTagStart ) ) ,
@@ -461,10 +480,10 @@ impl Lexer {
461
480
fn tag_opened ( & mut self , c : char ) -> Result {
462
481
match c {
463
482
'?' => self . move_to_with ( State :: InsideProcessingInstruction , Token :: ProcessingInstructionStart ) ,
464
- '/' => self . move_to_with ( State :: Normal , Token :: ClosingTagStart ) ,
483
+ '/' => self . move_to_with ( self . normal_state , Token :: ClosingTagStart ) ,
465
484
'!' => self . move_to ( State :: CommentOrCDataOrDoctypeStarted ) ,
466
- _ if is_whitespace_char ( c) => self . move_to_with_unread ( State :: Normal , & [ c] , Token :: OpeningTagStart ) ,
467
- _ if is_name_char ( c) => self . move_to_with_unread ( State :: Normal , & [ c] , Token :: OpeningTagStart ) ,
485
+ _ if is_whitespace_char ( c) => self . move_to_with_unread ( self . normal_state , & [ c] , Token :: OpeningTagStart ) ,
486
+ _ if is_name_char ( c) => self . move_to_with_unread ( self . normal_state , & [ c] , Token :: OpeningTagStart ) ,
468
487
_ => self . handle_error ( "<" , c)
469
488
}
470
489
}
@@ -475,6 +494,7 @@ impl Lexer {
475
494
'-' => self . move_to ( State :: CommentStarted ) ,
476
495
'[' => self . move_to ( State :: CDataStarted ( CDataStartedSubstate :: E ) ) ,
477
496
'D' => self . move_to ( State :: DoctypeStarted ( DoctypeStartedSubstate :: D ) ) ,
497
+ 'E' | 'A' | 'N' if matches ! ( self . normal_state, State :: InsideDoctype ) => self . move_to_with ( State :: InsideMarkupDeclaration , Token :: MarkupDeclarationStart ) ,
478
498
_ => self . handle_error ( "<!" , c) ,
479
499
}
480
500
}
@@ -500,6 +520,27 @@ impl Lexer {
500
520
)
501
521
}
502
522
523
+ /// Encountered '<!…' that isn't DOCTYPE or CDATA
524
+ fn markup_declaration ( & mut self , c : char ) -> Result {
525
+ match c {
526
+ '<' => self . handle_error ( "<!" , c) ,
527
+ '>' => self . move_to_with ( self . normal_state , Token :: TagEnd ) ,
528
+ '&' => Ok ( Some ( Token :: ReferenceStart ) ) ,
529
+ ';' => Ok ( Some ( Token :: ReferenceEnd ) ) ,
530
+ '"' => self . move_to_with ( State :: InsideMarkupDeclarationQuotedString ( QuoteStyle :: Double ) , Token :: DoubleQuote ) ,
531
+ '\'' => self . move_to_with ( State :: InsideMarkupDeclarationQuotedString ( QuoteStyle :: Single ) , Token :: SingleQuote ) ,
532
+ _ => Ok ( None ) ,
533
+ }
534
+ }
535
+
536
+ fn markup_declaration_string ( & mut self , c : char , q : QuoteStyle ) -> Result {
537
+ match c {
538
+ '"' if q == QuoteStyle :: Double => self . move_to_with ( State :: InsideMarkupDeclaration , Token :: DoubleQuote ) ,
539
+ '\'' if q == QuoteStyle :: Single => self . move_to_with ( State :: InsideMarkupDeclaration , Token :: SingleQuote ) ,
540
+ _ => Ok ( None ) ,
541
+ }
542
+ }
543
+
503
544
/// Encountered '<!D'
504
545
fn doctype_started ( & mut self , c : char , s : DoctypeStartedSubstate ) -> Result {
505
546
use self :: DoctypeStartedSubstate :: { D , DO , DOC , DOCT , DOCTY , DOCTYP } ;
@@ -509,33 +550,34 @@ impl Lexer {
509
550
DOC ; 'T' ; DOCT ; "<!DOC" ,
510
551
DOCT ; 'Y' ; DOCTY ; "<!DOCT" ,
511
552
DOCTY ; 'P' ; DOCTYP ; "<!DOCTY" ;
512
- DOCTYP ; 'E' ; "<!DOCTYP" ; self . move_to_with ( State :: DoctypeFinishing ( 1 ) , Token :: DoctypeStart )
553
+ DOCTYP ; 'E' ; "<!DOCTYP" ; self . move_to_and_reset_normal ( State :: InsideDoctype , Token :: DoctypeStart )
513
554
)
514
555
}
515
556
516
557
/// State used while awaiting the closing bracket for the <!DOCTYPE tag
517
- fn doctype_finishing ( & mut self , c : char , d : u8 ) -> Result {
558
+ fn inside_doctype ( & mut self , c : char ) -> Result {
518
559
match c {
519
- '<' => self . move_to ( State :: DoctypeFinishing ( d + 1 ) ) ,
520
- '>' if d == 1 => self . move_to_with ( State :: Normal , Token :: TagEnd ) ,
521
- '>' => self . move_to ( State :: DoctypeFinishing ( d - 1 ) ) ,
560
+ '>' => self . move_to_and_reset_normal ( State :: Normal , Token :: TagEnd ) ,
561
+ '<' => self . move_to ( State :: TagStarted ) ,
562
+ '&' => Ok ( Some ( Token :: ReferenceStart ) ) ,
563
+ ';' => Ok ( Some ( Token :: ReferenceEnd ) ) ,
522
564
_ => Ok ( None ) ,
523
565
}
524
566
}
525
567
526
568
/// Encountered '?'
527
569
fn processing_instruction_closing ( & mut self , c : char ) -> Result {
528
570
match c {
529
- '>' => self . move_to_with ( State :: Normal , Token :: ProcessingInstructionEnd ) ,
571
+ '>' => self . move_to_with ( self . normal_state , Token :: ProcessingInstructionEnd ) ,
530
572
_ => self . move_to_with_unread ( State :: InsideProcessingInstruction , & [ c] , Token :: Character ( '?' ) ) ,
531
573
}
532
574
}
533
575
534
576
/// Encountered '/'
535
577
fn empty_element_closing ( & mut self , c : char ) -> Result {
536
578
match c {
537
- '>' => self . move_to_with ( State :: Normal , Token :: EmptyTagEnd ) ,
538
- _ => self . move_to_with_unread ( State :: Normal , & [ c] , Token :: Character ( '/' ) ) ,
579
+ '>' => self . move_to_with ( self . normal_state , Token :: EmptyTagEnd ) ,
580
+ _ => self . move_to_with_unread ( self . normal_state , & [ c] , Token :: Character ( '/' ) ) ,
539
581
}
540
582
}
541
583
@@ -547,7 +589,7 @@ impl Lexer {
547
589
_ => self . move_to_with_unread ( State :: InsideComment , & [ c] , Token :: Character ( '-' ) ) ,
548
590
} ,
549
591
ClosingSubstate :: Second => match c {
550
- '>' => self . move_to_with ( State :: Normal , Token :: CommentEnd ) ,
592
+ '>' => self . move_to_with ( self . normal_state , Token :: CommentEnd ) ,
551
593
// double dash not followed by a greater-than is a hard error inside comment
552
594
_ => self . handle_error ( "--" , c) ,
553
595
} ,
@@ -576,7 +618,7 @@ impl Lexer {
576
618
_ => self . move_to_with_unread ( State :: Normal , & [ c] , Token :: Character ( ']' ) ) ,
577
619
} ,
578
620
ClosingSubstate :: Second => match c {
579
- '>' => self . move_to_with ( State :: Normal , Token :: CDataEnd ) ,
621
+ '>' => self . move_to_with ( self . normal_state , Token :: CDataEnd ) ,
580
622
_ => self . move_to_with_unread ( State :: Normal , & [ ']' , c] , Token :: Character ( ']' ) ) ,
581
623
} ,
582
624
}
@@ -825,19 +867,54 @@ mod tests {
825
867
/// A doctype whose internal subset holds a declaration with a quoted string
/// full of `>` characters.
#[test]
fn doctype_with_internal_subset_test() {
    let input = r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>>>" > ]> "#;
    let (mut lex, mut buf) = make_lex_and_buf(input);
    assert_oks!(for lex and buf ;
        // `<a>`
        Token::OpeningTagStart
        Token::Character('a')
        Token::TagEnd
        // `<!DOCTYPE ab[`
        Token::DoctypeStart
        // `<!ELEMENT ba ">>>>>" >`
        Token::MarkupDeclarationStart
        Token::DoubleQuote
        Token::DoubleQuote
        Token::TagEnd
        // `]>` followed by the trailing space
        Token::TagEnd
        Token::Whitespace(' ')
    );
    assert_none!(for lex and buf);
}
840
886
887
/// A doctype whose internal subset holds a declaration, a comment and a
/// processing instruction.
#[test]
fn doctype_internal_pi_comment() {
    let input = "<!DOCTYPE a [\n <!ELEMENT leopard ANY> <!-- <?non?>--> <?pi > ?> \n ]>";
    let (mut lex, mut buf) = make_lex_and_buf(input);
    assert_oks!(for lex and buf ;
        // `<!DOCTYPE a [`
        Token::DoctypeStart
        // `<!ELEMENT leopard ANY>`
        Token::MarkupDeclarationStart
        Token::TagEnd
        // `<!-- <?non?>-->`
        Token::CommentStart
        Token::Whitespace(' ')
        Token::Character('<')
        Token::Character('?')
        Token::Character('n')
        Token::Character('o')
        Token::Character('n')
        Token::Character('?')
        Token::Character('>')
        Token::CommentEnd
        // `<?pi > ?>`
        Token::ProcessingInstructionStart
        Token::Character('p')
        Token::Character('i')
        Token::Whitespace(' ')
        Token::TagEnd // not really
        Token::Whitespace(' ')
        Token::ProcessingInstructionEnd
        // `]>`
        Token::TagEnd // DTD
    );
    assert_none!(for lex and buf);
}
917
+
841
918
#[ test]
842
919
fn end_of_stream_handling_ok ( ) {
843
920
macro_rules! eof_check(
@@ -872,7 +949,6 @@ mod tests {
872
949
eof_check ! ( "<![CDA" ; 0 , 6 ) ;
873
950
eof_check ! ( "<![CDAT" ; 0 , 7 ) ;
874
951
eof_check ! ( "<![CDATA" ; 0 , 8 ) ;
875
- // eof_check!("--" ; 0, 2);
876
952
}
877
953
878
954
#[ test]
0 commit comments