From 6cdc1cec09860c1c9c498a48328c59443c0404c1 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 22 Jul 2024 22:24:15 +0000 Subject: [PATCH] HTML API: Add missing tags in IN BODY insertion mode to HTML Processor. As part of work to add more spec support to the HTML API, this patch adds support for the remaining missing tags in the IN BODY insertion mode. Not all of the added tags are supported, because in some cases they reset the insertion mode and are reprocessed where they will be rejected. This patch also improves the support of `get_modifiable_text()`, removing a leading newline inside a LISTING, PRE, or TEXTAREA element. Developed in https://github.com/WordPress/wordpress-develop/pull/6972 Discussed in https://core.trac.wordpress.org/ticket/61576 Props dmsnell, jonsurrell, westonruter. See #61576. Built from https://develop.svn.wordpress.org/trunk@58779 git-svn-id: https://core.svn.wordpress.org/trunk@58181 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- ...ass-wp-html-active-formatting-elements.php | 42 + .../html-api/class-wp-html-open-elements.php | 181 +++- .../class-wp-html-processor-state.php | 67 ++ .../html-api/class-wp-html-processor.php | 776 +++++++++++++----- .../html-api/class-wp-html-tag-processor.php | 129 ++- wp-includes/html-api/class-wp-html-token.php | 5 +- wp-includes/version.php | 2 +- 7 files changed, 962 insertions(+), 240 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 69e34dca49..2f51482eee 100644 --- a/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -86,6 +86,22 @@ public function current_node() { return $current_node ? $current_node : null; } + /** + * Inserts a "marker" at the end of the list of active formatting elements. 
+ * + * > The markers are inserted when entering applet, object, marquee, + * > template, td, th, and caption elements, and are used to prevent + * > formatting from "leaking" into applet, object, marquee, template, + * > td, th, and caption elements. + * + * @see https://html.spec.whatwg.org/#concept-parser-marker + * + * @since 6.7.0 + */ + public function insert_marker(): void { + $this->push( new WP_HTML_Token( null, 'marker', false ) ); + } + /** * Pushes a node onto the stack of active formatting elements. * @@ -184,4 +200,30 @@ public function walk_up() { yield $this->stack[ $i ]; } } + + /** + * Clears the list of active formatting elements up to the last marker. + * + * > When the steps below require the UA to clear the list of active formatting elements up to + * > the last marker, the UA must perform the following steps: + * > + * > 1. Let entry be the last (most recently added) entry in the list of active + * > formatting elements. + * > 2. Remove entry from the list of active formatting elements. + * > 3. If entry was a marker, then stop the algorithm at this point. + * > The list has been cleared up to the last marker. + * > 4. Go to step 1. 
+ * + * @see https://html.spec.whatwg.org/multipage/parsing.html#clear-the-list-of-active-formatting-elements-up-to-the-last-marker + * + * @since 6.7.0 + */ + public function clear_up_to_last_marker(): void { + foreach ( $this->walk_up() as $item ) { + array_pop( $this->stack ); + if ( 'marker' === $item->node_name ) { + break; + } + } + } } diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php index 065bbd25c9..d59bd32140 100644 --- a/wp-includes/html-api/class-wp-html-open-elements.php +++ b/wp-includes/html-api/class-wp-html-open-elements.php @@ -101,6 +101,49 @@ public function set_push_handler( Closure $handler ): void { $this->push_handler = $handler; } + /** + * Returns the name of the node at the nth position on the stack + * of open elements, or `null` if no such position exists. + * + * Note that this uses a 1-based index, which represents the + * "nth item" on the stack, counting from the top, where the + * top-most element is the 1st, the second is the 2nd, etc... + * + * @since 6.7.0 + * + * @param int $nth Retrieve the nth item on the stack, with 1 being + * the top element, 2 being the second, etc... + * @return string|null Name of the node on the stack at the given location, + * or `null` if the location isn't on the stack. + */ + public function at( int $nth ): ?string { + foreach ( $this->walk_down() as $item ) { + if ( 0 === --$nth ) { + return $item->node_name; + } + } + + return null; + } + + /** + * Reports if a node of a given name is in the stack of open elements. + * + * @since 6.7.0 + * + * @param string $node_name Name of node for which to check. + * @return bool Whether a node of the given name is in the stack of open elements. 
+ */ + public function contains( string $node_name ): bool { + foreach ( $this->walk_up() as $item ) { + if ( $node_name === $item->node_name ) { + return true; + } + } + + return false; + } + /** * Reports if a specific node is in the stack of open elements. * @@ -111,7 +154,7 @@ public function set_push_handler( Closure $handler ): void { */ public function contains_node( WP_HTML_Token $token ): bool { foreach ( $this->walk_up() as $item ) { - if ( $token->bookmark_name === $item->bookmark_name ) { + if ( $token === $item ) { return true; } } @@ -210,11 +253,6 @@ public function has_element_in_specific_scope( string $tag_name, $termination_li return true; } - switch ( $node->node_name ) { - case 'HTML': - return false; - } - if ( in_array( $node->node_name, $termination_list, true ) ) { return false; } @@ -226,7 +264,31 @@ public function has_element_in_specific_scope( string $tag_name, $termination_li /** * Returns whether a particular element is in scope. * + * > The stack of open elements is said to have a particular element in + * > scope when it has that element in the specific scope consisting of + * > the following element types: + * > + * > - applet + * > - caption + * > - html + * > - table + * > - td + * > - th + * > - marquee + * > - object + * > - template + * > - MathML mi + * > - MathML mo + * > - MathML mn + * > - MathML ms + * > - MathML mtext + * > - MathML annotation-xml + * > - SVG foreignObject + * > - SVG desc + * > - SVG title + * * @since 6.4.0 + * @since 6.7.0 Supports all required HTML elements. * * @see https://html.spec.whatwg.org/#has-an-element-in-scope * @@ -237,14 +299,16 @@ public function has_element_in_scope( string $tag_name ): bool { return $this->has_element_in_specific_scope( $tag_name, array( - - /* - * Because it's not currently possible to encounter - * one of the termination elements, they don't need - * to be listed here. 
If they were, they would be - * unreachable and only waste CPU cycles while - * scanning through HTML. - */ + 'APPLET', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', + // @todo: Support SVG and MathML nodes when support for foreign content is added. ) ); } @@ -252,8 +316,17 @@ public function has_element_in_scope( string $tag_name ): bool { /** * Returns whether a particular element is in list item scope. * + * > The stack of open elements is said to have a particular element + * > in list item scope when it has that element in the specific scope + * > consisting of the following element types: + * > + * > - All the element types listed above for the has an element in scope algorithm. + * > - ol in the HTML namespace + * > - ul in the HTML namespace + * * @since 6.4.0 * @since 6.5.0 Implemented: no longer throws on every invocation. + * @since 6.7.0 Supports all required HTML elements. * * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope * @@ -264,9 +337,19 @@ public function has_element_in_list_item_scope( string $tag_name ): bool { return $this->has_element_in_specific_scope( $tag_name, array( - // There are more elements that belong here which aren't currently supported. + 'APPLET', + 'BUTTON', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', 'OL', + 'TEMPLATE', 'UL', + // @todo: Support SVG and MathML nodes when support for foreign content is added. ) ); } @@ -274,7 +357,15 @@ public function has_element_in_list_item_scope( string $tag_name ): bool { /** * Returns whether a particular element is in button scope. * + * > The stack of open elements is said to have a particular element + * > in button scope when it has that element in the specific scope + * > consisting of the following element types: + * > + * > - All the element types listed above for the has an element in scope algorithm. 
+ * > - button in the HTML namespace + * * @since 6.4.0 + * @since 6.7.0 Supports all required HTML elements. * * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope * @@ -282,25 +373,52 @@ public function has_element_in_list_item_scope( string $tag_name ): bool { * @return bool Whether given element is in scope. */ public function has_element_in_button_scope( string $tag_name ): bool { - return $this->has_element_in_specific_scope( $tag_name, array( 'BUTTON' ) ); + return $this->has_element_in_specific_scope( + $tag_name, + array( + 'APPLET', + 'BUTTON', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', + // @todo: Support SVG and MathML nodes when support for foreign content is added. + ) + ); } /** * Returns whether a particular element is in table scope. * + * > The stack of open elements is said to have a particular element + * > in table scope when it has that element in the specific scope + * > consisting of the following element types: + * > + * > - html in the HTML namespace + * > - table in the HTML namespace + * > - template in the HTML namespace + * * @since 6.4.0 + * @since 6.7.0 Full implementation. * * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope * - * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. - * * @param string $tag_name Name of tag to check. * @return bool Whether given element is in scope. */ public function has_element_in_table_scope( string $tag_name ): bool { - throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' ); - - return false; // The linter requires this unreachable code until the function is implemented and can return. + return $this->has_element_in_specific_scope( + $tag_name, + array( + 'HTML', + 'TABLE', + 'TEMPLATE', + ) + ); } /** @@ -540,7 +658,16 @@ public function after_element_push( WP_HTML_Token $item ): void { * cases where the precalculated value needs to change. 
*/ switch ( $item->node_name ) { + case 'APPLET': case 'BUTTON': + case 'CAPTION': + case 'HTML': + case 'TABLE': + case 'TD': + case 'TH': + case 'MARQUEE': + case 'OBJECT': + case 'TEMPLATE': $this->has_p_in_button_scope = false; break; @@ -573,11 +700,17 @@ public function after_element_pop( WP_HTML_Token $item ): void { * cases where the precalculated value needs to change. */ switch ( $item->node_name ) { + case 'APPLET': case 'BUTTON': - $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); - break; - + case 'CAPTION': + case 'HTML': case 'P': + case 'TABLE': + case 'TD': + case 'TH': + case 'MARQUEE': + case 'OBJECT': + case 'TEMPLATE': $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); break; } diff --git a/wp-includes/html-api/class-wp-html-processor-state.php b/wp-includes/html-api/class-wp-html-processor-state.php index eadfe30d26..e0469bea02 100644 --- a/wp-includes/html-api/class-wp-html-processor-state.php +++ b/wp-includes/html-api/class-wp-html-processor-state.php @@ -311,6 +311,31 @@ class WP_HTML_Processor_State { */ const INSERTION_MODE_IN_FOREIGN_CONTENT = 'insertion-mode-in-foreign-content'; + /** + * No-quirks mode document compatibility mode. + * + * > In no-quirks mode, the behavior is (hopefully) the desired behavior + * > described by the modern HTML and CSS specifications. + * + * @since 6.7.0 + * + * @var string + */ + const NO_QUIRKS_MODE = 'no-quirks-mode'; + + /** + * Quirks mode document compatibility mode. + * + * > In quirks mode, layout emulates behavior in Navigator 4 and Internet + * > Explorer 5. This is essential in order to support websites that were + * > built before the widespread adoption of web standards. + * + * @since 6.7.0 + * + * @var string + */ + const QUIRKS_MODE = 'quirks-mode'; + /** * The stack of template insertion modes. 
* @@ -368,6 +393,30 @@ class WP_HTML_Processor_State { */ public $insertion_mode = self::INSERTION_MODE_INITIAL; + /** + * Indicates if the document is in quirks mode or no-quirks mode. + * + * Impact on HTML parsing: + * + * - In `NO_QUIRKS_MODE` CSS class and ID selectors match in a byte-for-byte + * manner, otherwise for backwards compatibility, class selectors are to + * match in an ASCII case-insensitive manner. + * + * - When not in `QUIRKS_MODE`, a TABLE start tag implicitly closes an open P tag + * if one is in scope and open, otherwise the TABLE becomes a child of the P. + * + * `QUIRKS_MODE` impacts many styling-related aspects of an HTML document, but + * none of the other changes modifies how the HTML is parsed or selected. + * + * @see self::QUIRKS_MODE + * @see self::NO_QUIRKS_MODE + * + * @since 6.7.0 + * + * @var string + */ + public $document_mode = self::NO_QUIRKS_MODE; + /** * Context node initializing fragment parser, if created as a fragment parser. * @@ -390,6 +439,24 @@ class WP_HTML_Processor_State { */ public $head_element = null; + /** + * FORM element pointer. + * + * > points to the last form element that was opened and whose end tag has + * > not yet been seen. It is used to make form controls associate with + * > forms in the face of dramatically bad markup, for historical reasons. + * > It is ignored inside template elements. + * + * @todo This may be invalidated by a seek operation. + * + * @see https://html.spec.whatwg.org/#form-element-pointer + * + * @since 6.7.0 + * + * @var WP_HTML_Token|null + */ + public $form_element = null; + /** * The frameset-ok flag indicates if a `FRAMESET` element is allowed in the current state. 
* diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index 72f39d3ad7..d614112a76 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -97,22 +97,11 @@ * will abort early and stop all processing. This draconian measure ensures * that the HTML Processor won't break any HTML it doesn't fully understand. * - * The following list specifies the HTML tags that _are_ supported: + * The HTML Processor supports all elements other than a specific set: * - * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. - * - Custom elements: All custom elements are supported. :) - * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, OPTGROUP, OPTION, PROGRESS, SEARCH, SELECT. - * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. - * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. - * - Links: A. - * - Lists: DD, DL, DT, LI, OL, UL. - * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. - * - Paragraph: BR, P. - * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. - * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. - * - Templating elements: SLOT. - * - Text decoration: RUBY. - * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. + * - Any element inside a TABLE. + * - Any element inside foreign content, including SVG and MATH. + * - Any element outside the IN BODY insertion mode, e.g. doctype declarations, meta, links. * * ### Supported markup * @@ -121,15 +110,30 @@ * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters * such a case it will stop processing. 
* - * The following list specifies HTML markup that _is_ supported: + * The following list illustrates some common examples of unexpected HTML inputs that + * the HTML Processor properly parses and represents: * - * - Markup involving only those tags listed above. - * - Fully-balanced and non-overlapping tags. - * - HTML with unexpected tag closers. - * - Some unbalanced or overlapping tags. - * - P tags after unclosed P tags. - * - BUTTON tags after unclosed BUTTON tags. - * - A tags after unclosed A tags that don't involve any active formatting elements. + * - HTML with optional tags omitted, e.g. `

<p>one<p>two`. + * - HTML with unexpected tag closers, e.g. `

<p>one </span> more</p>`. + * - Non-void tags with self-closing flag, e.g. `
<div/>the DIV is still open.</div>`. + * - Heading elements which close open heading elements of another level, e.g. `

<h1>Closed by </h2>
`. + * - Elements containing text that looks like other tags but isn't, e.g. `<title>The <img> is plaintext</title>`. + * - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. `<script>document.write('<p>Hi</p>');</script>`. + * - SCRIPT content which has been escaped, e.g. `<script><!-- document.write('<script>console.log("hi")</script>') --></script>`. + * + * ### Unsupported Features + * + * This parser does not report parse errors. + * + * Normally, when additional HTML or BODY tags are encountered in a document, if there + * are any additional attributes on them that aren't found on the previous elements, + * the existing HTML and BODY elements adopt those missing attribute values. This + * parser does not add those additional attributes. + * + * In certain situations, elements are moved to a different part of the document in + * a process called "adoption" and "fostering." Because the nodes move to a location + * in the document that the parser had already processed, this parser does not support + * these situations and will bail. * * @since 6.4.0 * @@ -1104,15 +1108,7 @@ private function step_in_body(): bool { $op = "{$op_sigil}{$token_name}"; switch ( $op ) { - case '#comment': - case '#funky-comment': - case '#presumptuous-tag': - $this->insert_html_element( $this->state->current_token ); - return true; - case '#text': - $this->reconstruct_active_formatting_elements(); - $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; /* @@ -1133,6 +1129,8 @@ private function step_in_body(): bool { return $this->step(); } + $this->reconstruct_active_formatting_elements(); + /* * Whitespace-only text does not affect the frameset-ok flag. * It is probably inter-element whitespace, but it may also @@ -1146,29 +1144,146 @@ private function step_in_body(): bool { $this->insert_html_element( $this->state->current_token ); return true; + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. 
+ */ case 'html': + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + /* + * > Otherwise, for each attribute on the token, check to see if the attribute + * > is already present on the top element of the stack of open elements. If + * > it is not, add the attribute and its corresponding value to that element. + * + * This parser does not currently support this behavior: ignore the token. + */ + } + + // Ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link", + * > "meta", "noframes", "script", "style", "template", "title" + * > + * > An end tag whose tag name is "template" + */ + case '+BASE': + case '+BASEFONT': + case '+BGSOUND': + case '+LINK': + case '+META': + case '+NOFRAMES': + case '+SCRIPT': + case '+STYLE': + case '+TEMPLATE': + case '+TITLE': + case '-TEMPLATE': + return $this->step_in_head(); + + /* + * > A start tag whose tag name is "body" + * + * This tag in the IN BODY insertion mode is a parse error. + */ + case '+BODY': + if ( + 1 === $this->state->stack_of_open_elements->count() || + 'BODY' !== $this->state->stack_of_open_elements->at( 2 ) || + $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) + ) { + // Ignore the token. + return $this->step(); + } + /* - * > A DOCTYPE token - * > Parse error. Ignore the token. + * > Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute + * > on the token, check to see if the attribute is already present on the body + * > element (the second element) on the stack of open elements, and if it is + * > not, add the attribute and its corresponding value to that element. + * + * This parser does not currently support this behavior: ignore the token. 
+ */ + $this->state->frameset_ok = false; return $this->step(); /* - * > A start tag whose tag name is "button" + * > A start tag whose tag name is "frameset" + * + * This tag in the IN BODY insertion mode is a parse error. */ - case '+BUTTON': - if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { - // @todo Indicate a parse error once it's possible. This error does not impact the logic here. - $this->generate_implied_end_tags(); - $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); + case '+FRAMESET': + if ( + 1 === $this->state->stack_of_open_elements->count() || + 'BODY' !== $this->state->stack_of_open_elements->at( 2 ) || + false === $this->state->frameset_ok + ) { + // Ignore the token. + return $this->step(); } - $this->reconstruct_active_formatting_elements(); - $this->insert_html_element( $this->state->current_token ); - $this->state->frameset_ok = false; + /* + * > Otherwise, run the following steps: + */ + $this->bail( 'Cannot process non-ignored FRAMESET tags.' ); + break; + /* + * > An end tag whose tag name is "body" + */ + case '-BODY': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Otherwise, if there is a node in the stack of open elements that is not either a + * > dd element, a dt element, an li element, an optgroup element, an option element, + * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody + * > element, a td element, a tfoot element, a th element, a thead element, a tr + * > element, the body element, or the html element, then this is a parse error. + * + * There is nothing to do for this parse error, so don't check for it. + */ + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY; return true; + /* + * > An end tag whose tag name is "html" + */ + case '-HTML': + if ( ! 
$this->state->stack_of_open_elements->has_element_in_scope( 'BODY' ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * > Otherwise, if there is a node in the stack of open elements that is not either a + * > dd element, a dt element, an li element, an optgroup element, an option element, + * > a p element, an rb element, an rp element, an rt element, an rtc element, a tbody + * > element, a td element, a tfoot element, a th element, a thead element, a tr + * > element, the body element, or the html element, then this is a parse error. + * + * There is nothing to do for this parse error, so don't check for it. + */ + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); + /* * > A start tag whose tag name is one of: "address", "article", "aside", * > "blockquote", "center", "details", "dialog", "dir", "div", "dl", @@ -1207,52 +1322,6 @@ private function step_in_body(): bool { $this->insert_html_element( $this->state->current_token ); return true; - /* - * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", - * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", - * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", - * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" - */ - case '-ADDRESS': - case '-ARTICLE': - case '-ASIDE': - case '-BLOCKQUOTE': - case '-BUTTON': - case '-CENTER': - case '-DETAILS': - case '-DIALOG': - case '-DIR': - case '-DIV': - case '-DL': - case '-FIELDSET': - case '-FIGCAPTION': - case '-FIGURE': - case '-FOOTER': - case '-HEADER': - case '-HGROUP': - case '-LISTING': - case '-MAIN': - case '-MENU': - case '-NAV': - case '-OL': - case '-PRE': - case '-SEARCH': - case '-SECTION': - case '-SUMMARY': - case '-UL': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { - // @todo Report parse error. 
- // Ignore the token. - return $this->step(); - } - - $this->generate_implied_end_tags(); - if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { - // @todo Record parse error: this error doesn't impact parsing. - } - $this->state->stack_of_open_elements->pop_until( $token_name ); - return true; - /* * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ @@ -1288,35 +1357,39 @@ private function step_in_body(): bool { if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } + + /* + * > If the next token is a U+000A LINE FEED (LF) character token, + * > then ignore that token and move on to the next one. (Newlines + * > at the start of pre blocks are ignored as an authoring convenience.) + * + * This is handled in `get_modifiable_text()`. + */ + $this->insert_html_element( $this->state->current_token ); $this->state->frameset_ok = false; return true; /* - * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + * > A start tag whose tag name is "form" */ - case '-H1': - case '-H2': - case '-H3': - case '-H4': - case '-H5': - case '-H6': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { - /* - * This is a parse error; ignore the token. - * - * @todo Indicate a parse error once it's possible. - */ + case '+FORM': + $stack_contains_template = $this->state->stack_of_open_elements->contains( 'TEMPLATE' ); + + if ( isset( $this->state->form_element ) && ! $stack_contains_template ) { + // Parse error: ignore the token. return $this->step(); } - $this->generate_implied_end_tags(); + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } - if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { - // @todo Record parse error: this error doesn't impact parsing. 
+ $this->insert_html_element( $this->state->current_token ); + if ( ! $stack_contains_template ) { + $this->state->form_element = $this->state->current_token; } - $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); return true; /* @@ -1377,6 +1450,150 @@ private function step_in_body(): bool { $this->insert_html_element( $this->state->current_token ); return true; + case '+PLAINTEXT': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + /* + * @todo This may need to be handled in the Tag Processor and turn into + * a single self-contained tag like TEXTAREA, whose modifiable text + * is the rest of the input document as plaintext. + */ + $this->bail( 'Cannot process PLAINTEXT elements.' ); + break; + + /* + * > A start tag whose tag name is "button" + */ + case '+BUTTON': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + $this->generate_implied_end_tags(); + $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); + } + + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + + return true; + + /* + * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote", + * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", + * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", + * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" + * + * @todo This needs to check if the element in scope is an HTML element, meaning that + * when SVG and MathML support is added, this needs to differentiate between an + * HTML element of the given name, such as `
`, and a foreign element of + * the same given name. + */ + case '-ADDRESS': + case '-ARTICLE': + case '-ASIDE': + case '-BLOCKQUOTE': + case '-BUTTON': + case '-CENTER': + case '-DETAILS': + case '-DIALOG': + case '-DIR': + case '-DIV': + case '-DL': + case '-FIELDSET': + case '-FIGCAPTION': + case '-FIGURE': + case '-FOOTER': + case '-HEADER': + case '-HGROUP': + case '-LISTING': + case '-MAIN': + case '-MENU': + case '-NAV': + case '-OL': + case '-PRE': + case '-SEARCH': + case '-SECTION': + case '-SUMMARY': + case '-UL': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { + // @todo Report parse error. + // Ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { + // @todo Record parse error: this error doesn't impact parsing. + } + $this->state->stack_of_open_elements->pop_until( $token_name ); + return true; + + /* + * > An end tag whose tag name is "form" + */ + case '-FORM': + if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { + $node = $this->state->form_element; + $this->state->form_element = null; + + /* + * > If node is null or if the stack of open elements does not have node + * > in scope, then this is a parse error; return and ignore the token. + * + * @todo It's necessary to check if the form token itself is in scope, not + * simply whether any FORM is in scope. + */ + if ( + null === $node || + ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) + ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + if ( $node !== $this->state->stack_of_open_elements->current_node() ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + $this->bail( 'Cannot close a FORM when other elements remain open as this would throw off the breadcrumbs for the following tokens.' 
); + } + + $this->state->stack_of_open_elements->remove_node( $node ); + } else { + /* + * > If the stack of open elements does not have a form element in scope, + * > then this is a parse error; return and ignore the token. + * + * Note that unlike in the clause above, this is checking for any FORM in scope. + */ + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + + if ( ! $this->state->stack_of_open_elements->current_node_is( 'FORM' ) ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + } + + $this->state->stack_of_open_elements->pop_until( 'FORM' ); + return true; + } + break; + + /* + * > An end tag whose tag name is "p" + */ + case '-P': + if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->insert_html_element( $this->state->current_token ); + } + + $this->close_a_p_element(); + return true; + /* * > An end tag whose tag name is "li" * > An end tag whose tag name is one of: "dd", "dt" @@ -1423,17 +1640,35 @@ private function step_in_body(): bool { return true; /* - * > An end tag whose tag name is "p" + * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" */ - case '-P': - if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) { - $this->insert_html_element( $this->state->current_token ); + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { + /* + * This is a parse error; ignore the token. + * + * @todo Indicate a parse error once it's possible. + */ + return $this->step(); } - $this->close_a_p_element(); + $this->generate_implied_end_tags(); + + if ( ! 
$this->state->stack_of_open_elements->current_node_is( $token_name ) ) { + // @todo Record parse error: this error doesn't impact parsing. + } + + $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); return true; - // > A start tag whose tag name is "a" + /* + * > A start tag whose tag name is "a" + */ case '+A': foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { switch ( $item->node_name ) { @@ -1474,6 +1709,22 @@ private function step_in_body(): bool { $this->state->active_formatting_elements->push( $this->state->current_token ); return true; + /* + * > A start tag whose tag name is "nobr" + */ + case '+NOBR': + $this->reconstruct_active_formatting_elements(); + + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'NOBR' ) ) { + // Parse error. + $this->run_adoption_agency_algorithm(); + $this->reconstruct_active_formatting_elements(); + } + + $this->insert_html_element( $this->state->current_token ); + $this->state->active_formatting_elements->push( $this->state->current_token ); + return true; + /* * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i", * > "nobr", "s", "small", "strike", "strong", "tt", "u" @@ -1494,16 +1745,65 @@ private function step_in_body(): bool { $this->run_adoption_agency_algorithm(); return true; + /* + * > A start tag whose tag name is one of: "applet", "marquee", "object" + */ + case '+APPLET': + case '+MARQUEE': + case '+OBJECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + $this->state->active_formatting_elements->insert_marker(); + $this->state->frameset_ok = false; + return true; + + /* + * > An end tag token whose tag name is one of: "applet", "marquee", "object" + * + * @todo This needs to check if the element in scope is an HTML element, meaning that + * when SVG and MathML support is added, this needs to differentiate between an + * HTML element of the
given name, such as `<object>`, and a foreign element of + * the same given name. + */ + case '-APPLET': + case '-MARQUEE': + case '-OBJECT': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { + // Parse error: ignore the token. + return $this->step(); + } + + $this->generate_implied_end_tags(); + if ( ! $this->state->stack_of_open_elements->current_node_is( $token_name ) ) { + // This is a parse error. + } + + $this->state->stack_of_open_elements->pop_until( $token_name ); + $this->state->active_formatting_elements->clear_up_to_last_marker(); + return true; + + /* + * > A start tag whose tag name is "table" + */ + case '+TABLE': + if ( + WP_HTML_Processor_State::QUIRKS_MODE !== $this->state->document_mode && + $this->state->stack_of_open_elements->has_p_in_button_scope() + ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return true; + /* * > An end tag whose tag name is "br" - * > Parse error. Drop the attributes from the token, and act as described in the next - * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather - * > than the end tag token that it actually is. + * + * This is prevented from happening because the Tag Processor + * reports all closing BR tags as if they were opening tags. */ - case '-BR': - $this->bail( 'Closing BR tags require unimplemented special handling.' ); - // This return required because PHPCS can't determine that the call to bail() throws.
- return false; /* * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" @@ -1525,15 +1825,26 @@ private function step_in_body(): bool { case '+INPUT': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $type_attribute = $this->get_attribute( 'type' ); + /* * > If the token does not have an attribute with the name "type", or if it does, * > but that attribute's value is not an ASCII case-insensitive match for the * > string "hidden", then: set the frameset-ok flag to "not ok". */ + $type_attribute = $this->get_attribute( 'type' ); if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) { $this->state->frameset_ok = false; } + + return true; + + /* + * > A start tag whose tag name is one of: "param", "source", "track" + */ + case '+PARAM': + case '+SOURCE': + case '+TRACK': + $this->insert_html_element( $this->state->current_token ); return true; /* @@ -1548,11 +1859,80 @@ private function step_in_body(): bool { return true; /* - * > A start tag whose tag name is one of: "param", "source", "track" + * > A start tag whose tag name is "image" */ - case '+PARAM': - case '+SOURCE': - case '+TRACK': + case '+IMAGE': + /* + * > Parse error. Change the token's tag name to "img" and reprocess it. (Don't ask.) + * + * Note that this is handled elsewhere, so it should not be possible to reach this code. + */ + $this->bail( "Cannot process an IMAGE tag. (Don't ask.)" ); + break; + + /* + * > A start tag whose tag name is "textarea" + */ + case '+TEXTAREA': + $this->insert_html_element( $this->state->current_token ); + + /* + * > If the next token is a U+000A LINE FEED (LF) character token, then ignore + * > that token and move on to the next one. (Newlines at the start of + * > textarea elements are ignored as an authoring convenience.) + * + * This is handled in `get_modifiable_text()`. 
+ */ + + $this->state->frameset_ok = false; + + /* + * > Switch the insertion mode to "text". + * + * As a self-contained node, this behavior is handled in the Tag Processor. + */ + return true; + + /* + * > A start tag whose tag name is "xmp" + */ + case '+XMP': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->reconstruct_active_formatting_elements(); + $this->state->frameset_ok = false; + + /* + * > Follow the generic raw text element parsing algorithm. + * + * As a self-contained node, this behavior is handled in the Tag Processor. + */ + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * A start tag whose tag name is "iframe" + */ + case '+IFRAME': + $this->state->frameset_ok = false; + + /* + * > Follow the generic raw text element parsing algorithm. + * + * As a self-contained node, this behavior is handled in the Tag Processor. + */ + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "noembed" + * > A start tag whose tag name is "noscript", if the scripting flag is enabled + * + * The scripting flag is never enabled in this parser. + */ + case '+NOEMBED': $this->insert_html_element( $this->state->current_token ); return true; @@ -1597,69 +1977,89 @@ private function step_in_body(): bool { $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); return true; - } - /* - * These tags require special handling in the 'in body' insertion mode - * but that handling hasn't yet been implemented. - * - * As the rules for each tag are implemented, the corresponding tag - * name should be removed from this list. An accompanying test should - * help ensure this list is maintained. 
- * - * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags - * - * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's - * possible to handle "any other start tag" and "any other end tag" below, - * as that guarantees execution doesn't proceed for the unimplemented tags. - * - * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody - */ - switch ( $token_name ) { - case 'APPLET': - case 'BASE': - case 'BASEFONT': - case 'BGSOUND': - case 'BODY': - case 'CAPTION': - case 'COL': - case 'COLGROUP': - case 'FORM': - case 'FRAME': - case 'FRAMESET': - case 'HEAD': - case 'HTML': - case 'IFRAME': - case 'LINK': - case 'MARQUEE': - case 'MATH': - case 'META': - case 'NOBR': - case 'NOEMBED': - case 'NOFRAMES': - case 'NOSCRIPT': - case 'OBJECT': - case 'PLAINTEXT': - case 'RB': - case 'RP': - case 'RT': - case 'RTC': - case 'SARCASM': - case 'SCRIPT': - case 'STYLE': - case 'SVG': - case 'TABLE': - case 'TBODY': - case 'TD': - case 'TEMPLATE': - case 'TEXTAREA': - case 'TFOOT': - case 'TH': - case 'THEAD': - case 'TITLE': - case 'TR': - case 'XMP': - $this->bail( "Cannot process {$token_name} element." ); + /* + * > A start tag whose tag name is one of: "rb", "rtc" + */ + case '+RB': + case '+RTC': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) { + $this->generate_implied_end_tags(); + + if ( $this->state->stack_of_open_elements->current_node_is( 'RUBY' ) ) { + // @todo Indicate a parse error once it's possible. 
+ } + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "rp", "rt" + */ + case '+RP': + case '+RT': + if ( $this->state->stack_of_open_elements->has_element_in_scope( 'RUBY' ) ) { + $this->generate_implied_end_tags( 'RTC' ); + + $current_node_name = $this->state->stack_of_open_elements->current_node()->node_name; + if ( 'RTC' === $current_node_name || 'RUBY' === $current_node_name ) { + // @todo Indicate a parse error once it's possible. + } + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "math" + */ + case '+MATH': + $this->reconstruct_active_formatting_elements(); + + /* + * @todo Adjust MathML attributes for the token. (This fixes the case of MathML attributes that are not all lowercase.) + * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink.) + * + * These ought to be handled in the attribute methods. + */ + + $this->bail( 'Cannot process MATH element, opening foreign content.' ); + break; + + /* + * > A start tag whose tag name is "svg" + */ + case '+SVG': + $this->reconstruct_active_formatting_elements(); + + /* + * @todo Adjust SVG attributes for the token. (This fixes the case of SVG attributes that are not all lowercase.) + * @todo Adjust foreign attributes for the token. (This fixes the use of namespaced attributes, in particular XLink in SVG.) + * + * These ought to be handled in the attribute methods. + */ + + $this->bail( 'Cannot process SVG element, opening foreign content.' ); + break; + + /* + * > A start tag whose tag name is one of: "caption", "col", "colgroup", + * > "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr" + */ + case '+CAPTION': + case '+COL': + case '+COLGROUP': + case '+FRAME': + case '+HEAD': + case '+TBODY': + case '+TD': + case '+TFOOT': + case '+TH': + case '+THEAD': + case '+TR': + // Parse error. 
Ignore the token. + return $this->step(); } if ( ! parent::is_tag_closer() ) { @@ -1681,6 +2081,12 @@ private function step_in_body(): bool { * close anything beyond its containing `P` or `DIV` element. */ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + /* + * @todo This needs to check if the element in scope is an HTML element, meaning that + * when SVG and MathML support is added, this needs to differentiate between an + * HTML element of the given name, such as ``, and a foreign element of + * the same given name. + */ if ( $token_name === $node->node_name ) { break; } diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index 77782aa950..7d04fd31d8 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -129,7 +129,7 @@ * $processor = new WP_HTML_Tag_Processor( '
' ); * true === $processor->next_tag( 'DIV' ); * - * #### Special elements + * #### Special self-contained elements * * Some HTML elements are handled in a special way; their start and end tags * act like a void tag. These are special because their contents can't contain @@ -755,6 +755,20 @@ class WP_HTML_Tag_Processor { */ protected $seek_count = 0; + /** + * Byte offset where an immediately-following text node must begin for its + * leading linefeed to be skipped (set after LISTING and PRE), or null if none. + * + * > If the next token is a U+000A LINE FEED (LF) character token, then + * > ignore that token and move on to the next one. (Newlines at the start + * > of [these] elements are ignored as an authoring convenience.) + * + * @since 6.7.0 + * + * @var int|null + */ + private $skip_newline_at = null; + /** * Constructor. * @@ -926,20 +940,23 @@ private function base_class_next_token(): bool { $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; /* - * For non-DATA sections which might contain text that looks like HTML tags but - * isn't, scan with the appropriate alternative mode. Looking at the first letter - * of the tag name as a pre-check avoids a string allocation when it's not needed. + * Certain tags require additional processing. The first-letter pre-check + * avoids unnecessary string allocation when comparing the tag names. + * + * - IFRAME + * - LISTING (deprecated) + * - NOEMBED (deprecated) + * - NOFRAMES (deprecated) + * - PRE + * - SCRIPT + * - STYLE + * - TEXTAREA + * - TITLE + * - XMP (deprecated) */ - $t = $this->html[ $this->tag_name_starts_at ]; if ( $this->is_closing_tag || - !
( - 'i' === $t || 'I' === $t || - 'n' === $t || 'N' === $t || - 's' === $t || 'S' === $t || - 't' === $t || 'T' === $t || - 'x' === $t || 'X' === $t - ) + 1 !== strspn( $this->html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) ) { return true; } @@ -947,6 +964,26 @@ private function base_class_next_token(): bool { $tag_name = $this->get_tag(); /* + * For LISTING, PRE, and TEXTAREA, the first linefeed of an immediately-following + * text node is ignored as an authoring convenience. + * + * @see static::skip_newline_at + */ + if ( 'LISTING' === $tag_name || 'PRE' === $tag_name ) { + $this->skip_newline_at = $this->bytes_already_parsed; + return true; + } + + /* + * There are certain elements whose children are not DATA but are instead + * RCDATA or RAWTEXT. These cannot contain other elements, and the contents + * are parsed as plaintext, with character references decoded in RCDATA but + * not in RAWTEXT. + * + * These elements are described here as "self-contained" or special atomic + * elements whose end tag is consumed with the opening tag, and they will + * contain modifiable text inside of them. + * * Preserve the opening tag pointers, as these will be overwritten * when finding the closing tag. They will be reset after finding * the closing to tag to point to the opening of the special atomic @@ -2690,13 +2727,23 @@ public function has_self_closing_flag(): bool { * $p->is_tag_closer() === true; * * @since 6.2.0 + * @since 6.7.0 Reports all BR tags as opening tags. * * @return bool Whether the current tag is a tag closer. */ public function is_tag_closer(): bool { return ( self::STATE_MATCHED_TAG === $this->parser_state && - $this->is_closing_tag + $this->is_closing_tag && + + /* + * The BR tag can only exist as an opening tag. If something like `
</br>` + * appears then the HTML parser will treat it as an opening tag with no + * attributes. The BR tag is unique in this way. + * + * @see https://html.spec.whatwg.org/#parsing-main-inbody + */ + 'BR' !== $this->get_tag() ); } @@ -2825,17 +2872,38 @@ public function get_comment_type(): ?string { * that a token has modifiable text, and a token with modifiable text may * have an empty string (e.g. a comment with no contents). * + * Limitations: + * + * - This function will not strip the leading newline appropriately + * after seeking into a LISTING or PRE element. To ensure that the + * newline is treated properly, seek to the LISTING or PRE opening + * tag instead of to the first text node inside the element. + * * @since 6.5.0 + * @since 6.7.0 Replaces NULL bytes (U+0000) and newlines appropriately. * * @return string */ public function get_modifiable_text(): string { - if ( null === $this->text_starts_at ) { + if ( null === $this->text_starts_at || 0 === $this->text_length ) { return ''; } $text = substr( $this->html, $this->text_starts_at, $this->text_length ); + /* + * Pre-processing the input stream would normally happen before + * any parsing is done, but deferring it means it's possible to + * skip in most cases. When getting the modifiable text, however + * it's important to apply the pre-processing steps, which is + * normalizing newlines. + * + * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream + * @see https://infra.spec.whatwg.org/#normalize-newlines + */ + $text = str_replace( "\r\n", "\n", $text ); + $text = str_replace( "\r", "\n", $text ); + // Comment data is not decoded.
if ( self::STATE_CDATA_NODE === $this->parser_state || @@ -2843,10 +2911,10 @@ public function get_modifiable_text(): string { self::STATE_DOCTYPE === $this->parser_state || self::STATE_FUNKY_COMMENT === $this->parser_state ) { - return $text; + return str_replace( "\x00", "\u{FFFD}", $text ); } - $tag_name = $this->get_tag(); + $tag_name = $this->get_token_name(); if ( // Script data is not decoded. 'SCRIPT' === $tag_name || @@ -2858,29 +2926,34 @@ public function get_modifiable_text(): string { 'STYLE' === $tag_name || 'XMP' === $tag_name ) { - return $text; + return str_replace( "\x00", "\u{FFFD}", $text ); } $decoded = WP_HTML_Decoder::decode_text_node( $text ); /* - * TEXTAREA skips a leading newline, but this newline may appear not only as the - * literal character `\n`, but also as a character reference, such as in the - * following markup: ``. + * Skip the first line feed after LISTING, PRE, and TEXTAREA opening tags. * - * For these cases it's important to first decode the text content before checking - * for a leading newline and removing it. + * Note that this first newline may come in the form of a character + * reference, such as `&#x0a;`, and so it's important to perform + * this transformation only after decoding the raw text content. */ if ( - self::STATE_MATCHED_TAG === $this->parser_state && - 'TEXTAREA' === $tag_name && - strlen( $decoded ) > 0 && - "\n" === $decoded[0] + ( "\n" === ( $decoded[0] ?? '' ) ) && + ( ( $this->skip_newline_at === $this->token_starts_at && '#text' === $tag_name ) || 'TEXTAREA' === $tag_name ) ) { - return substr( $decoded, 1 ); + $decoded = substr( $decoded, 1 ); } - return $decoded; + /* + * Only in normative text nodes does the NULL byte (U+0000) get removed. + * In all other contexts it's replaced by the replacement character (U+FFFD) + * for security reasons (to avoid joining together strings that were safe + * when separated, but not when joined). + */ + return '#text' === $tag_name + ?
str_replace( "\x00", '', $decoded ) + : str_replace( "\x00", "\u{FFFD}", $decoded ); } /** diff --git a/wp-includes/html-api/class-wp-html-token.php b/wp-includes/html-api/class-wp-html-token.php index fe8636fb5e..948fe343df 100644 --- a/wp-includes/html-api/class-wp-html-token.php +++ b/wp-includes/html-api/class-wp-html-token.php @@ -72,12 +72,13 @@ class WP_HTML_Token { * * @since 6.4.0 * - * @param string $bookmark_name Name of bookmark corresponding to location in HTML where token is found. + * @param string|null $bookmark_name Name of bookmark corresponding to location in HTML where token is found, + * or `null` for markers and nodes without a bookmark. * @param string $node_name Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker". * @param bool $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid. * @param callable|null $on_destroy Optional. Function to call when destroying token, useful for releasing the bookmark. */ - public function __construct( string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) { + public function __construct( ?string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) { $this->bookmark_name = $bookmark_name; $this->node_name = $node_name; $this->has_self_closing_flag = $has_self_closing_flag; diff --git a/wp-includes/version.php b/wp-includes/version.php index 0488739be6..3de31b7b78 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.7-alpha-58778'; +$wp_version = '6.7-alpha-58779'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.