Skip to content

Commit 1139a51

Browse files
committed
HTML API: Parse DOCTYPE tokens and set HTML parser mode accordingly.
This patch adds until-now missing code to parse the structure of HTML DOCTYPE declarations. The DOCTYPE is mostly unused but can dictate the document compatibility mode, which governs whether CSS class names match in an ASCII case-insensitive way or not, and whether TABLE elements close an open P element. The DOCTYPE information is made available through a new method on the Tag Processor, `get_doctype_info()`. Developed in #7195. Discussed in https://core.trac.wordpress.org/ticket/61576 Props dmsnell, jonsurrell. See #61576. git-svn-id: https://develop.svn.wordpress.org/trunk@58925 602fd350-edb4-49c9-b593-d223f7449a82
1 parent b515d22 commit 1139a51

8 files changed

+792
-19
lines changed

phpcs.xml.dist

+1
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@
262262
in the parsing, and distance the code from its standard. -->
263263
<rule ref="Generic.PHP.DiscourageGoto.Found">
264264
<exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
265+
<exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern>
265266
</rule>
266267

267268
<!-- Exclude sample config from modernization to prevent breaking CI workflows based on WP-CLI scaffold.

src/wp-includes/html-api/class-wp-html-doctype-info.php

+616
Large diffs are not rendered by default.

src/wp-includes/html-api/class-wp-html-processor.php

+5-7
Original file line numberDiff line numberDiff line change
@@ -1076,26 +1076,24 @@ private function step_initial(): bool {
10761076
* > A DOCTYPE token
10771077
*/
10781078
case 'html':
1079-
$contents = $this->get_modifiable_text();
1080-
if ( ' html' !== $contents ) {
1081-
/*
1082-
* @todo When the HTML Tag Processor fully parses the DOCTYPE declaration,
1083-
* this code should examine the contents to set the compatability mode.
1084-
*/
1085-
$this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' );
1079+
$doctype = $this->get_doctype_info();
1080+
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
1081+
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
10861082
}
10871083

10881084
/*
10891085
* > Then, switch the insertion mode to "before html".
10901086
*/
10911087
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
1088+
$this->insert_html_element( $this->state->current_token );
10921089
return true;
10931090
}
10941091

10951092
/*
10961093
* > Anything else
10971094
*/
10981095
initial_anything_else:
1096+
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
10991097
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
11001098
return $this->step( self::REPROCESS_CURRENT_NODE );
11011099
}

src/wp-includes/html-api/class-wp-html-tag-processor.php

+22-1
Original file line numberDiff line numberDiff line change
@@ -4026,6 +4026,27 @@ private function matches(): bool {
40264026
return true;
40274027
}
40284028

4029+
/**
4030+
* Gets DOCTYPE declaration info from a DOCTYPE token.
4031+
*
4032+
* DOCTYPE tokens may appear in many places in an HTML document. In most places, they are
4033+
* simply ignored. The main parsing functions find the basic shape of DOCTYPE tokens but
4034+
* do not perform detailed parsing.
4035+
*
4036+
* This method can be called to perform a full parse of the DOCTYPE token and retrieve
4037+
* its information.
4038+
*
4039+
* @return WP_HTML_Doctype_Info|null The DOCTYPE declaration information or `null` if not
4040+
* currently at a DOCTYPE node.
4041+
*/
4042+
public function get_doctype_info(): ?WP_HTML_Doctype_Info {
4043+
if ( self::STATE_DOCTYPE !== $this->parser_state ) {
4044+
return null;
4045+
}
4046+
4047+
return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->token_length ) );
4048+
}
4049+
40294050
/**
40304051
* Parser Ready State.
40314052
*
@@ -4117,7 +4138,7 @@ private function matches(): bool {
41174138

41184139
/**
41194140
* Indicates that the parser has found a DOCTYPE node and it's
4120-
* possible to read and modify its modifiable text.
4141+
* possible to read its DOCTYPE information via `get_doctype_info()`.
41214142
*
41224143
* @since 6.5.0
41234144
*

src/wp-settings.php

+1
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@
252252
require ABSPATH . WPINC . '/html-api/html5-named-character-references.php';
253253
require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php';
254254
require ABSPATH . WPINC . '/html-api/class-wp-html-span.php';
255+
require ABSPATH . WPINC . '/html-api/class-wp-html-doctype-info.php';
255256
require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php';
256257
require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php';
257258
require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php';
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
<?php
2+
/**
3+
* Unit tests covering WP_HTML_Doctype_Info functionality.
4+
*
5+
* @package WordPress
6+
* @subpackage HTML-API
7+
*/
8+
9+
/**
10+
* @group html-api
11+
*
12+
* @coversDefaultClass WP_HTML_Doctype_Info
13+
*/
14+
class Tests_HtmlApi_WpHtmlDoctypeInfo extends WP_UnitTestCase {
15+
/**
16+
* Test DOCTYPE handling.
17+
*
18+
* @ticket 61576
19+
*
20+
* @dataProvider data_parseable_raw_doctypes
21+
*/
22+
public function test_doctype_doc_info(
23+
string $html,
24+
string $expected_compat_mode,
25+
?string $expected_name = null,
26+
?string $expected_public_id = null,
27+
?string $expected_system_id = null
28+
) {
29+
$doctype = WP_HTML_Doctype_Info::from_doctype_token( $html );
30+
$this->assertNotNull(
31+
$doctype,
32+
"Should have parsed the following doctype declaration: {$html}"
33+
);
34+
35+
$this->assertSame(
36+
$expected_compat_mode,
37+
$doctype->indicated_compatability_mode,
38+
'Failed to infer the expected document compatability mode.'
39+
);
40+
41+
$this->assertSame(
42+
$expected_name,
43+
$doctype->name,
44+
'Failed to parse the expected DOCTYPE name.'
45+
);
46+
47+
$this->assertSame(
48+
$expected_public_id,
49+
$doctype->public_identifier,
50+
'Failed to parse the expected DOCTYPE public identifier.'
51+
);
52+
53+
$this->assertSame(
54+
$expected_system_id,
55+
$doctype->system_identifier,
56+
'Failed to parse the expected DOCTYPE system identifier.'
57+
);
58+
}
59+
60+
/**
61+
* Data provider.
62+
*
63+
* @return array[]
64+
*/
65+
public static function data_parseable_raw_doctypes(): array {
66+
return array(
67+
'Missing doctype name' => array( '<!DOCTYPE>', 'quirks' ),
68+
'HTML5 doctype' => array( '<!DOCTYPE html>', 'no-quirks', 'html' ),
69+
'HTML5 doctype no whitespace before name' => array( '<!DOCTYPEhtml>', 'no-quirks', 'html' ),
70+
'XHTML doctype' => array( '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">', 'no-quirks', 'html', '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' ),
71+
'SVG doctype' => array( '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">', 'quirks', 'svg', '-//W3C//DTD SVG 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' ),
72+
'MathML doctype' => array( '<!DOCTYPE math PUBLIC "-//W3C//DTD MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/mathml2.dtd">', 'quirks', 'math', '-//W3C//DTD MathML 2.0//EN', 'http://www.w3.org/Math/DTD/mathml2/mathml2.dtd' ),
73+
'Doctype with null byte replacement' => array( "<!DOCTYPE null-\0 PUBLIC '\0' '\0\0'>", 'quirks', "null-\u{FFFD}", "\u{FFFD}", "\u{FFFD}\u{FFFD}" ),
74+
'Uppercase doctype' => array( '<!DOCTYPE UPPERCASE>', 'quirks', 'uppercase' ),
75+
'Lowercase doctype' => array( '<!doctype lowercase>', 'quirks', 'lowercase' ),
76+
'Doctype with whitespace' => array( "<!DOCTYPE\n\thtml\f\rPUBLIC\r\n''\t''>", 'no-quirks', 'html', '', '' ),
77+
'Doctype trailing characters' => array( "<!DOCTYPE html PUBLIC '' '' Anything (except closing angle bracket) is just fine here !!!>", 'no-quirks', 'html', '', '' ),
78+
'An ugly no-quirks doctype' => array( "<!dOcTyPehtml\tPublIC\"pub-id\"'sysid'>", 'no-quirks', 'html', 'pub-id', 'sysid' ),
79+
'Missing public ID' => array( '<!DOCTYPE html PUBLIC>', 'quirks', 'html' ),
80+
'Missing system ID' => array( '<!DOCTYPE html SYSTEM>', 'quirks', 'html' ),
81+
'Missing close quote public ID' => array( "<!DOCTYPE html PUBLIC 'xyz>", 'quirks', 'html', 'xyz' ),
82+
'Missing close quote system ID' => array( "<!DOCTYPE html SYSTEM 'xyz>", 'quirks', 'html', null, 'xyz' ),
83+
'Missing close quote system ID with public' => array( "<!DOCTYPE html PUBLIC 'abc' 'xyz>", 'quirks', 'html', 'abc', 'xyz' ),
84+
'Bogus characters instead of system/public' => array( '<!DOCTYPE html FOOBAR>', 'quirks', 'html' ),
85+
'Bogus characters instead of PUBLIC quote' => array( "<!DOCTYPE html PUBLIC x ''''>", 'quirks', 'html' ),
86+
'Bogus characters instead of SYSTEM quote ' => array( "<!DOCTYPE html SYSTEM x ''>", 'quirks', 'html' ),
87+
'Emoji' => array( '<!DOCTYPE 🏴󠁧󠁢󠁥󠁮󠁧󠁿 PUBLIC "🔥" "😈">', 'quirks', "\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}", '🔥', '😈' ),
88+
'Bogus characters instead of SYSTEM quote after public' => array( "<!DOCTYPE html PUBLIC ''x''>", 'quirks', 'html', '' ),
89+
'Special quirks mode if system unset' => array( '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//">', 'quirks', 'html', '-//W3C//DTD HTML 4.01 Frameset//' ),
90+
'Special limited-quirks mode if system set' => array( '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//" "">', 'limited-quirks', 'html', '-//W3C//DTD HTML 4.01 Frameset//', '' ),
91+
);
92+
}
93+
94+
/**
95+
* @dataProvider invalid_inputs
96+
*
97+
* @ticket 61576
98+
*/
99+
public function test_invalid_inputs_return_null( string $html ) {
100+
$this->assertNull( WP_HTML_Doctype_Info::from_doctype_token( $html ) );
101+
}
102+
103+
/**
104+
* Data provider.
105+
*
106+
* @return array[]
107+
*/
108+
public static function invalid_inputs(): array {
109+
return array(
110+
'Empty string' => array( '' ),
111+
'Other HTML' => array( '<div>' ),
112+
'DOCTYPE after HTML' => array( 'x<!DOCTYPE>' ),
113+
'DOCTYPE before HTML' => array( '<!DOCTYPE>x' ),
114+
'Incomplete DOCTYPE' => array( '<!DOCTYPE' ),
115+
'Pseudo DOCTYPE containing ">"' => array( '<!DOCTYPE html PUBLIC ">">' ),
116+
);
117+
}
118+
}

tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php

+13-11
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
2727
const SKIP_TESTS = array(
2828
'comments01/line0155' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
2929
'comments01/line0169' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
30+
'doctype01/line0380' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
3031
'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
3132
'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
3233
'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
@@ -115,7 +116,7 @@ public function data_external_html5lib_tests() {
115116

116117
$test_context_element = $test[1];
117118

118-
if ( self::should_skip_test( $test_context_element, $test_name, $test[3] ) ) {
119+
if ( self::should_skip_test( $test_context_element, $test_name ) ) {
119120
continue;
120121
}
121122

@@ -133,7 +134,7 @@ public function data_external_html5lib_tests() {
133134
*
134135
* @return bool True if the test case should be skipped. False otherwise.
135136
*/
136-
private static function should_skip_test( ?string $test_context_element, string $test_name, string $expected_tree ): bool {
137+
private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
137138
if ( null !== $test_context_element && 'body' !== $test_context_element ) {
138139
return true;
139140
}
@@ -189,6 +190,15 @@ private static function build_tree_representation( ?string $fragment_context, st
189190
}
190191

191192
switch ( $token_type ) {
193+
case '#doctype':
194+
$doctype = $processor->get_doctype_info();
195+
$output .= "<!DOCTYPE {$doctype->name}";
196+
if ( null !== $doctype->public_identifier || null !== $doctype->system_identifier ) {
197+
$output .= " \"{$doctype->public_identifier}\" \"{$doctype->system_identifier}\"";
198+
}
199+
$output .= ">\n";
200+
break;
201+
192202
case '#tag':
193203
$namespace = $processor->get_namespace();
194204
$tag_name = 'html' === $namespace
@@ -450,15 +460,7 @@ public static function parse_html5_dat_testfile( $filename ) {
450460
*/
451461
case 'document':
452462
if ( '|' === $line[0] ) {
453-
/*
454-
* The next_token() method these tests rely on do not stop
455-
* at doctype nodes. Strip doctypes from output.
456-
* @todo Restore this line if and when the processor
457-
* exposes doctypes.
458-
*/
459-
if ( '| <!DOCTYPE ' !== substr( $line, 0, 12 ) ) {
460-
$test_dom .= substr( $line, 2 );
461-
}
463+
$test_dom .= substr( $line, 2 );
462464
} else {
463465
// This is a text node that includes unescaped newlines.
464466
// Everything else should be singles lines starting with "| ".

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

+16
Original file line numberDiff line numberDiff line change
@@ -2939,4 +2939,20 @@ public function test_unclosed_funky_comment_input_too_short() {
29392939
$this->assertFalse( $processor->next_tag() );
29402940
$this->assertTrue( $processor->paused_at_incomplete_token() );
29412941
}
2942+
2943+
/**
2944+
* Test basic DOCTYPE handling.
2945+
*
2946+
* @ticket 61576
2947+
*/
2948+
public function test_doctype_doc_name() {
2949+
$processor = new WP_HTML_Tag_Processor( '<!DOCTYPE html>' );
2950+
$this->assertTrue( $processor->next_token() );
2951+
$doctype = $processor->get_doctype_info();
2952+
$this->assertNotNull( $doctype );
2953+
$this->assertSame( 'html', $doctype->name );
2954+
$this->assertSame( 'no-quirks', $doctype->indicated_compatability_mode );
2955+
$this->assertNull( $doctype->public_identifier );
2956+
$this->assertNull( $doctype->system_identifier );
2957+
}
29422958
}

0 commit comments

Comments
 (0)