From 2b2d6fe93b2af0395a46bbfe3b51ad32d780d3b9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 01:57:45 -0700 Subject: [PATCH 1/9] HTML API: Simplify breadcrumb accounting. Since the HTML Processor started visiting all nodes in a document, both real and virtual, the breadcrumb accounting became a bit complicated and it's not entirely clear that it is fully reliable. In this patch the breadcrumbs are rebuilt separately from the stack of open elements in order to eliminate the problem of the stateful stack interactions and the post-hoc event queue. Breadcrumbs are greatly simplified as a result, and more verifiably correct, in this construction. --- .../html-api/class-wp-html-processor.php | 126 +++++++----------- .../html-api/wpHtmlProcessorSemanticRules.php | 11 +- 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 588d2fbe7d7c9..8c75bd3f06b21 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -211,6 +211,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $element_queue = array(); + /** + * Stores the current breadcrumbs. + * + * @since 6.7.0 + * + * @var string[] + */ + private $breadcrumbs = array(); + /** * Current stack event, if set, representing a matched token. * @@ -310,8 +319,8 @@ public static function create_fragment( $html, $context = '', $encoding = false ); - $processor->state->stack_of_open_elements->push( $context_node ); $processor->context_node = $context_node; + $processor->breadcrumbs = array( 'HTML', $context_node->node_name ); return $processor; } @@ -523,44 +532,46 @@ public function next_token() { return false; } - if ( 'done' !== $this->has_seen_context_node && 0 === count( $this->element_queue ) && ! 
$this->step() ) { - while ( 'context-node' !== $this->state->stack_of_open_elements->current_node()->bookmark_name && $this->state->stack_of_open_elements->pop() ) { - continue; - } - $this->has_seen_context_node = 'done'; - return $this->next_token(); + /* + * Prime the events if there are none. + * + * @todo In some cases, probably related to the adoption agency + * algorithm, this call to step() doesn't create any new + * events. Calling it again creates them. Figure out why + * this is and if it's inherent or if it's a bug. Looping + * until there are events or until there are no more + * tokens works in the meantime and isn't obviously wrong. + */ + while ( empty( $this->element_queue ) && $this->step() ) { + continue; } + // Process the next event on the queue. $this->current_element = array_shift( $this->element_queue ); - while ( isset( $this->context_node ) && ! $this->has_seen_context_node ) { - if ( isset( $this->current_element ) ) { - if ( $this->context_node === $this->current_element->token && WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $this->has_seen_context_node = true; - return $this->next_token(); - } - } - $this->current_element = array_shift( $this->element_queue ); + if ( ! isset( $this->current_element ) ) { + return false; } - if ( ! isset( $this->current_element ) ) { - if ( 'done' === $this->has_seen_context_node ) { - return false; - } else { - return $this->next_token(); - } + $is_pop = WP_HTML_Stack_Event::POP === $this->current_element->operation; + + /* + * The root node only exists in the fragment parser, and closing it + * indicates that the parse is complete. Stop before popping it from + * the breadcrumbs. + */ + if ( 'root-node' === $this->current_element->token->bookmark_name ) { + return ! 
$is_pop && $this->next_token(); } - if ( isset( $this->context_node ) && WP_HTML_Stack_Event::POP === $this->current_element->operation && $this->context_node === $this->current_element->token ) { - $this->element_queue = array(); - $this->current_element = null; - return false; + // Adjust the breadcrumbs for this event. + if ( $is_pop ) { + array_pop( $this->breadcrumbs ); + } else { + $this->breadcrumbs[] = $this->current_element->token->node_name; } // Avoid sending close events for elements which don't expect a closing. - if ( - WP_HTML_Stack_Event::POP === $this->current_element->operation && - ! static::expects_closer( $this->current_element->token ) - ) { + if ( $is_pop && ! static::expects_closer( $this->current_element->token ) ) { return $this->next_token(); } @@ -643,10 +654,11 @@ public function matches_breadcrumbs( $breadcrumbs ) { return false; } - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + for ( $i = count( $this->breadcrumbs ) - 1; $i >= 0; $i-- ) { + $node = $this->breadcrumbs[ $i ]; $crumb = strtoupper( current( $breadcrumbs ) ); - if ( '*' !== $crumb && $node->node_name !== $crumb ) { + if ( '*' !== $crumb && $node !== $crumb ) { return false; } @@ -862,46 +874,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - $breadcrumbs = array(); - - foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { - $breadcrumbs[] = $stack_item->node_name; - } - - if ( ! 
$this->is_virtual() ) { - return $breadcrumbs; - } - - foreach ( $this->element_queue as $queue_item ) { - if ( $this->current_element->token->bookmark_name === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'context-node' === $queue_item->token->bookmark_name ) { - break; - } - - if ( 'real' === $queue_item->provenance ) { - break; - } - - if ( WP_HTML_Stack_Event::PUSH === $queue_item->operation ) { - $breadcrumbs[] = $queue_item->token->node_name; - } else { - array_pop( $breadcrumbs ); - } - } - - if ( null !== parent::get_token_name() && ! parent::is_tag_closer() ) { - array_pop( $breadcrumbs ); - } - - // Add the virtual node we're at. - if ( WP_HTML_Stack_Event::PUSH === $this->current_element->operation ) { - $breadcrumbs[] = $this->current_element->token->node_name; - } - - return $breadcrumbs; + return $this->breadcrumbs; } /** @@ -930,9 +903,7 @@ public function get_breadcrumbs() { * @return int Nesting-depth of current location in the document. */ public function get_current_depth() { - return $this->is_virtual() - ? count( $this->get_breadcrumbs() ) - : $this->state->stack_of_open_elements->count(); + return count( $this->breadcrumbs ); } /** @@ -2552,7 +2523,6 @@ public function seek( $bookmark_name ) { ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start : 0; $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; - $bookmark_length = $this->bookmarks[ $actual_bookmark_name ]->length; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; /* @@ -2610,6 +2580,12 @@ public function seek( $bookmark_name ) { $this->state->frameset_ok = true; $this->element_queue = array(); $this->current_element = null; + + if ( isset( $this->context_node ) ) { + $this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 ); + } else { + $this->breadcrumbs = array(); + } } // When moving forwards, reparse the document until reaching the same location as the original bookmark. 
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php index 717276935a780..adce614506429 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php @@ -387,7 +387,16 @@ public function test_in_body_any_other_end_tag_with_unclosed_non_special_element $this->assertSame( 'CODE', $processor->get_tag(), "Expected to start test on CODE element but found {$processor->get_tag()} instead." ); $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'SPAN', 'CODE' ), $processor->get_breadcrumbs(), 'Failed to produce expected DOM nesting.' ); - $this->assertTrue( $processor->next_token(), 'Failed to advance past CODE tag to expected SPAN closer.' ); + $this->assertTrue( + $processor->next_tag( + array( + 'tag_name' => 'SPAN', + 'tag_closers' => 'visit', + ) + ), + 'Failed to advance past CODE tag to expected SPAN closer.' + ); + $this->assertSame( 'SPAN', $processor->get_tag() ); $this->assertTrue( $processor->is_tag_closer(), 'Expected to find closing SPAN, but found opener instead.' ); $this->assertSame( array( 'HTML', 'BODY', 'DIV' ), $processor->get_breadcrumbs(), 'Failed to advance past CODE tag to expected DIV opener.' ); From 6962fa29654f9a85d5c3136cd5ca458dceb8c5ad Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 16:25:15 -0700 Subject: [PATCH 2/9] HTML API: Expand Unsupported class and make it available for debugging. The HTML Processor internally throws an exception when it reaches HTML that it knows it cannot process, but this exception is not made available to calling code. It can be useful to extract more knowledge about why it gave up, especially for debugging purposes. In this patch, more context is added to the WP_HTML_Unsupported_Exception and the last exception is made available to calling code, if it asks. 
--- .../html-api/class-wp-html-processor.php | 65 +++++++++++++++++++ .../class-wp-html-unsupported-exception.php | 18 ++++- 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8c75bd3f06b21..f3be6c67064b9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -188,6 +188,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $last_error = null; + /** + * Stores context for why the parser bailed on unsupported HTML, if it did. + * + * @see self::get_unsupported_exception + * + * @since 6.7.0 + * + * @var WP_HTML_Unsupported_Exception|null + */ + private $unsupported_exception = null; + /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * @@ -384,6 +395,45 @@ function ( WP_HTML_Token $token ) { }; } + /** + * Stops the parser and terminates its execution when encountering unsupported markup. + * + * @throws WP_HTML_Unsupported_Exception Halts execution of the parser. + * + * @since 6.7.0 + * + * @param string $message Explains support is missing in order to parse the current node. 
+ * + * @return mixed + */ + private function bail( string $message ) { + $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + $token = substr( $this->html, $here->start, $here->length ); + + $open_elements = array(); + foreach ( $this->state->stack_of_open_elements->stack as $item ) { + $open_elements[] = $item->node_name; + } + + $active_formats = array(); + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + $active_formats[] = $item->node_name; + } + + $this->last_error = self::ERROR_UNSUPPORTED; + + $this->unsupported_exception = new WP_HTML_Unsupported_Exception( + $message, + $this->state->current_token->node_name, + $here->start, + $token, + $open_elements, + $active_formats + ); + + throw $this->unsupported_exception; + } + /** * Returns the last error, if any. * @@ -411,6 +461,21 @@ public function get_last_error() { return $this->last_error; } + /** + * Returns context for why the parser aborted due to unsupported HTML, if it did. + * + * This is meant for debugging purposes, not for production use. + * + * @since 6.7.0 + * + * @see self::$unsupported_exception + * + * @return WP_HTML_Unsupported_Exception|null + */ + public function get_unsupported_exception() { + return $this->unsupported_exception; + } + /** * Finds the next tag matching the $query. 
* diff --git a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php index 6e7228670bf8b..1a29714727623 100644 --- a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php +++ b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php @@ -1,4 +1,4 @@ -token_name = $token_name; + $this->token_at = $token_at; + $this->token = $token; + + $this->stack_of_open_elements = $stack_of_open_elements; + $this->active_formatting_elements = $active_formatting_elements; + } } From ab1096fdc53c48e503c274914020e1c11e4529b9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 6 Jul 2024 16:10:15 -0700 Subject: [PATCH 3/9] HTML API: Implement "reconstruct the active formatting elements" algorithm. As part of work to add more spec support to the HTML API, this patch fills out the active format reconstruction algorithm so that more HTML can be supported in situations requiring that reconstruction, for example, when a formatting element such as an A tag or a CODE tag is implicitly closed. See Core-61576 --- ...ass-wp-html-active-formatting-elements.php | 32 +++++++++ .../html-api/class-wp-html-processor.php | 45 +++++++++++-- .../tests/html-api/wpHtmlProcessor.php | 15 +++-- .../html-api/wpHtmlProcessorBreadcrumbs.php | 66 +++++++++++-------- 4 files changed, 120 insertions(+), 38 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 9f7fee9076243..e45e55e09dd28 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -43,6 +43,22 @@ class WP_HTML_Active_Formatting_Elements { */ private $stack = array(); + /** + * Returns the node at the given index in the list of active formatting elements. 
+ * + * Do not use this method; it is meant to be used only by the HTML Processor. + * + * @since 6.7.0 + * + * @access private + * + * @param int $index Number of nodes from the top node to return. + * @return WP_HTML_Token|null Node at the given index in the stack, if one exists, otherwise null. + */ + public function at( $index ) { + return $this->stack[ $index ]; + } + /** * Reports if a specific node is in the stack of active formatting elements. * @@ -86,6 +102,22 @@ public function current_node() { return $current_node ? $current_node : null; } + /** + * Inserts a "marker" at the end of the list of active formatting elements. + * + * > The markers are inserted when entering applet, object, marquee, + * > template, td, th, and caption elements, and are used to prevent + * > formatting from "leaking" into applet, object, marquee, template, + * > td, th, and caption elements. + * + * @see https://html.spec.whatwg.org/#concept-parser-marker + * + * @since 6.7.0 + */ + public function insert_marker() { + $this->push( new WP_HTML_Token( null, 'marker', false ) ); + } + /** * Pushes a node onto the stack of active formatting elements. * diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f3be6c67064b9..aa285727b6b4c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -2864,7 +2864,7 @@ private function generate_implied_end_tags_thoroughly() { * > in the current body, cell, or caption (whichever is youngest) that haven't * > been explicitly closed. * - * @since 6.4.0 + * @since 6.7.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * @@ -2873,15 +2873,19 @@ private function generate_implied_end_tags_thoroughly() { * @return bool Whether any formatting elements needed to be reconstructed. 
*/ private function reconstruct_active_formatting_elements() { + $count = $this->state->active_formatting_elements->count(); + /* * > If there are no entries in the list of active formatting elements, then there is nothing * > to reconstruct; stop this algorithm. */ - if ( 0 === $this->state->active_formatting_elements->count() ) { + if ( 0 === $count ) { return false; } - $last_entry = $this->state->active_formatting_elements->current_node(); + // Start at the last node in the list of active formatting elements. + $currently_at = $count - 1; + $last_entry = $this->state->active_formatting_elements->at( $currently_at ); if ( /* @@ -2900,8 +2904,39 @@ private function reconstruct_active_formatting_elements() { return false; } - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + $entry = $last_entry; + + while ( $currently_at >= 0 ) { + if ( 0 === $currently_at ) { + goto create; + } + $entry = $this->state->active_formatting_elements->at( --$currently_at ); + + /* + * > If entry is neither a marker nor an element that is also in the stack of open elements, + * > go to the step labeled rewind. + */ + if ( 'marker' === $entry->node_name || $this->state->stack_of_open_elements->contains_node( $entry ) ) { + break; + } + } + + advance: + $entry = $this->state->active_formatting_elements->at( ++$currently_at ); + + create: + $this->insert_html_element( $entry ); + + /* + * > Replace the entry for entry in the list with an entry for new element. + * This doesn't need to happen here since no DOM is being created. 
+ */ + + if ( $count - 1 !== $currently_at ) { + goto advance; + } + + return true; } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index b842703a7a135..8294aa5b198b3 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -112,18 +112,23 @@ public function test_clear_to_navigate_after_seeking() { } /** - * Ensures that support is added for reconstructing active formatting elements - * before the HTML Processor handles situations with unclosed formats requiring it. + * Ensures that support is added for reconstructing active formatting elements. * * @ticket 58517 * * @covers WP_HTML_Processor::reconstruct_active_formatting_elements */ - public function test_fails_to_reconstruct_formatting_elements() { - $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); + public function test_reconstructs_formatting_elements() { + $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find first EM.' ); - $this->assertFalse( $processor->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' ); + $this->assertSame( array( 'HTML', 'BODY', 'P', 'EM' ), $processor->get_breadcrumbs(), 'Found incorrect breadcrumbs for first EM.' ); + $this->assertTrue( $processor->next_tag( 'SPAN' ), 'Could not find test span.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'EM', 'EM', 'SPAN' ), + $processor->get_breadcrumbs(), + 'Found incorrect breadcrumbs for test SPAN; should have created two EMs.' + ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 403f40a1da032..5ec846df16fec 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -219,45 +219,55 @@ public static function data_unsupported_elements() { } /** - * @ticket 58517 - * - * @dataProvider data_unsupported_markup + * Ensures that formats inside unclosed A elements are reconstructed. * - * @param string $html HTML containing unsupported markup. + * @ticket 61576 */ - public function test_fails_when_encountering_unsupported_markup( $html, $description ) { - $processor = WP_HTML_Processor::create_fragment( $html ); - - while ( $processor->next_token() && null === $processor->get_attribute( 'supported' ) ) { - continue; - } + public function test_reconstructs_formatting_from_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( 'Click Here' ); - $this->assertNull( - $processor->get_last_error(), - 'Bailed on unsupported input before finding supported checkpoint: check test code.' + $processor->next_tag( 'STRONG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'A', 'STRONG' ), + $processor->get_breadcrumbs(), + 'Failed to construct starting breadcrumbs properly.' 
); - $this->assertTrue( $processor->get_attribute( 'supported' ), 'Did not find required supported element.' ); - $processor->next_token(); - $this->assertNotNull( $processor->get_last_error(), "Didn't properly reject unsupported markup: {$description}" ); + $processor->next_tag( 'BIG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'STRONG', 'A', 'BIG' ), + $processor->get_breadcrumbs(), + 'Failed to reconstruct the active formatting elements after an unclosed A element.' + ); } /** - * Data provider. + * Ensures that unclosed A elements are reconstructed. * - * @return array[] + * @ticket 61576 */ - public static function data_unsupported_markup() { - return array( - 'A with formatting following unclosed A' => array( - 'Click Here', - 'Unclosed formatting requires complicated reconstruction.', - ), + public function test_reconstructs_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( '

' ); - 'A after unclosed A inside DIV' => array( - '
', - 'A is a formatting element, which requires more complicated reconstruction.', - ), + $processor->next_tag( 'DIV' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV' ), + $processor->get_breadcrumbs(), + 'Failed to construct breadcrumbs properly - the DIV should have closed the A element.' + ); + + // When the DIV re-opens, it reconstructs an unclosed A, then the A in the text is a second A. + $processor->next_tag( 'A' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'A' ), + 'Failed to create proper breadcrumbs for recreated A element.' + ); + + // This is the one that's second in the raw text. + $processor->next_tag( 'A' ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'A' ), + 'Failed to create proper breadcrumbs for explicit A element - this A should have closed the reconstructed A.' ); } From 374cba3ae0920ba05426acb5c5ea7d3b4396da2b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Aug 2024 00:33:04 -0700 Subject: [PATCH 4/9] Continue iterating --- ...ass-wp-html-active-formatting-elements.php | 6 +- .../html-api/class-wp-html-processor.php | 80 ++++++++++++------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index b5547c4ff98f3..2428f9a88e8f1 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -44,7 +44,7 @@ class WP_HTML_Active_Formatting_Elements { private $stack = array(); /** - * Returns the node at the given index in the list of active formatting elements. + * Returns the node at the given 1-offset index in the list of active formatting elements. * * Do not use this method; it is meant to be used only by the HTML Processor. * @@ -55,8 +55,8 @@ class WP_HTML_Active_Formatting_Elements { * @param int $index Number of nodes from the top node to return. 
* @return WP_HTML_Token|null Node at the given index in the stack, if one exists, otherwise null. */ - public function at( $index ) { - return $this->stack[ $index ]; + public function at( $nth ) { + return $this->stack[ $nth - 1 ]; } /** diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 24e3d01340c0b..cadd86b8aa240 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5093,63 +5093,83 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { private function reconstruct_active_formatting_elements(): bool { $count = $this->state->active_formatting_elements->count(); /* - * > If there are no entries in the list of active formatting elements, then there is nothing - * > to reconstruct; stop this algorithm. + * > 1. If there are no entries in the list of active formatting elements, + * > then there is nothing to reconstruct; stop this algorithm. */ if ( 0 === $count ) { return false; } - // Start at the last node in the list of active formatting elements. - $currently_at = $count - 1; + $currently_at = $count; $last_entry = $this->state->active_formatting_elements->at( $currently_at ); + /* + * > 2. If the last (most recently added) entry in the list of active formatting + * > elements is a marker, or if it is an element that is in the stack of open + * > elements, then there is nothing to reconstruct; stop this algorithm. + */ if ( - - /* - * > If the last (most recently added) entry in the list of active formatting elements is a marker; - * > stop this algorithm. - */ 'marker' === $last_entry->node_name || - - /* - * > If the last (most recently added) entry in the list of active formatting elements is an - * > element that is in the stack of open elements, then there is nothing to reconstruct; - * > stop this algorithm. 
- */ $this->state->stack_of_open_elements->contains_node( $last_entry ) ) { return false; } + /* + * > 3. Let entry be the last (most recently added) element + * > in the list of active formatting elements. + */ $entry = $last_entry; - while ( $currently_at >= 0 ) { - if ( 0 === $currently_at ) { - goto create; - } - $entry = $this->state->active_formatting_elements->at( --$currently_at ); + /* + * > 4. Rewind: If there are no entries before entry in the list of active + * > formatting elements, then jump to the step labeled create. + */ + rewind: + if ( 1 === $currently_at ) { + goto create; + } - /* - * > If entry is neither a marker nor an element that is also in the stack of open elements, - * > go to the step labeled rewind. - */ - if ( 'marker' === $entry->node_name || $this->state->stack_of_open_elements->contains_node( $entry ) ) { - break; - } + /* + * > 5. Let entry be the entry one earlier than entry + * > in the list of active formatting elements. + */ + $entry = $this->state->active_formatting_elements->at( --$currently_at ); + + /* + * > 6. If entry is neither a marker nor an element that is also in + * > the stack of open elements, go to the step labeled rewind. + */ + if ( + 'marker' !== $entry->node_name && + ! $this->state->stack_of_open_elements->contains_node( $entry ) + ) { + goto rewind; } + /* + * > 7. Advance: Let entry be the element one later than entry + * > in the list of active formatting elements. + */ advance: $entry = $this->state->active_formatting_elements->at( ++$currently_at ); + /* + * > 8. Create: Insert an HTML element for the token for which the + * > element entry was created, to obtain new element. + */ create: $this->insert_html_element( $entry ); /* - * > Replace the entry for entry in the list with an entry for new element. - * This doesn't need to happen here since no DOM is being created. + * > 9. Replace the entry for entry in the list with an entry for new element. 
+ * > This doesn't need to happen here since no DOM is being created. */ - if ( $count - 1 !== $currently_at ) { + /* + * > 10. If the entry for new element in the list of active formatting elements + * > is not the last entry in the list, return to the step labeled advance. + */ + if ( $count !== $currently_at ) { goto advance; } From 0c2f0c342110898cb04997bd184ba2ea3bad46ce Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 5 Sep 2024 19:52:38 -0700 Subject: [PATCH 5/9] WIP: Explore reading attributes from recreated formatting elements. --- ...ass-wp-html-active-formatting-elements.php | 28 +++++++- .../html-api/class-wp-html-processor.php | 71 +++++++++++++++++-- .../html-api/class-wp-html-tag-processor.php | 7 +- .../html-api/wpHtmlProcessorHtml5lib.php | 5 ++ 4 files changed, 98 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 2428f9a88e8f1..9b90d2d367d00 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -126,8 +126,9 @@ public function insert_marker(): void { * @see https://html.spec.whatwg.org/#push-onto-the-list-of-active-formatting-elements * * @param WP_HTML_Token $token Push this node onto the stack. + * @return bool Whether a node was pushed onto the stack of active formatting elements. 
*/ - public function push( WP_HTML_Token $token ) { + public function push( WP_HTML_Token $token ): bool { /* * > If there are already three elements in the list of active formatting elements after the last marker, * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and @@ -136,11 +137,32 @@ public function push( WP_HTML_Token $token ) { * > created by the parser; two elements have the same attributes if all their parsed attributes can be * > paired such that the two attributes in each pair have identical names, namespaces, and values * > (the order of the attributes does not matter). - * - * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. */ + + if ( 'marker' !== $token->node_name ) { + $existing_count = 0; + foreach ( $this->walk_up() as $item ) { + if ( 'marker' === $item->node_name ) { + break; + } + + if ( + $item->node_name === $token->node_name && + $item->namespace === $token->namespace + // @todo Compare attributes. For now, bail if there are three matching tag names + namespaces. + ) { + ++$existing_count; + if ( $existing_count >= 3 ) { + // @todo Implement removing the earliest element and moving forward. + return false; + } + } + } + } + // > Add element to the list of active formatting elements. $this->stack[] = $token; + return true; } /** diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 872f765888fdf..3deb023e5beb6 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -256,6 +256,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $context_node = null; + /** + * If a formatting element has been reconstructed, this will hold + * the parsed attributes from the original format, once requested. + * + * These attributes are not modifiable. 
+ * + * @since 6.7.0 + * + * @var array|null + */ + protected $actively_reconstructed_formatting_attributes = array(); + /* * Public Interface Functions */ @@ -2346,7 +2358,10 @@ private function step_in_body(): bool { $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2367,7 +2382,10 @@ private function step_in_body(): bool { case '+U': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2383,7 +2401,10 @@ private function step_in_body(): bool { } $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' 
); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -4845,7 +4866,27 @@ public function get_token_type(): ?string { * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { - return $this->is_virtual() ? null : parent::get_attribute( $name ); + if ( $this->is_virtual() ) { + $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? null; + if ( null === $virtual_attributes ) { + return null; + } + + $current_attributes = $this->attributes; + $current_updates = $this->lexical_updates; + $this->lexical_updates = array(); + $this->attributes = $virtual_attributes; + $parser_state = $this->parser_state; + $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; + $attribute_names = parent::get_attribute( $name ); + $this->attributes = $current_attributes; + $this->parser_state = $parser_state; + $this->lexical_updates = $current_updates; + + return $attribute_names; + } + + return parent::get_attribute( $name ); } /** @@ -4906,7 +4947,24 @@ public function remove_attribute( $name ): bool { * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ): ?array { - return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix ); + if ( $this->is_virtual() ) { + $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? 
null; + if ( null === $virtual_attributes ) { + return null; + } + + $current_attributes = $this->attributes; + $this->attributes = $virtual_attributes; + $parser_state = $this->parser_state; + $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; + $attribute_names = parent::get_attribute_names_with_prefix( $prefix ); + $this->attributes = $current_attributes; + $this->parser_state = $parser_state; + + return $attribute_names; + } + + return parent::get_attribute_names_with_prefix( $prefix ); } /** @@ -5400,6 +5458,7 @@ private function reconstruct_active_formatting_elements(): bool { */ rewind: if ( 1 === $currently_at ) { + echo "\e[90mJumping to create\e[m\n"; goto create; } @@ -5435,7 +5494,7 @@ private function reconstruct_active_formatting_elements(): bool { $this->insert_html_element( $entry ); /* - * > 9. Replace the entry for entry in the list with an entry for new element. + * > 9. Replace the entry for _entry_ in the list with an entry for new element. * > This doesn't need to happen here since no DOM is being created. */ diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1ea8066d97ade..8e6183e7748c0 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -708,7 +708,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var WP_HTML_Attribute_Token[] */ - private $attributes = array(); + protected $attributes = array(); /** * Tracks spans of duplicate attributes on a given tag, used for removing @@ -2973,8 +2973,7 @@ public function get_qualified_tag_name(): ?string { * * @since 6.7.0 * - * @param string $attribute_name Which attribute to adjust. - * + * @param string $attribute_name Which attribute to adjust. 
* @return string|null */ public function get_qualified_attribute_name( $attribute_name ): ?string { @@ -2982,7 +2981,7 @@ public function get_qualified_attribute_name( $attribute_name ): ?string { return null; } - $namespace = $this->get_namespace(); + $namespace = $namespace_override ?? $this->get_namespace(); $lower_name = strtolower( $attribute_name ); if ( 'math' === $namespace && 'definitionurl' === $lower_name ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 54d60f8c78a66..e7dd53a7bfc8e 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -153,6 +153,9 @@ private static function should_skip_test( ?string $test_context_element, string /** * Generates the tree-like structure represented in the Html5lib tests. * + * @throws WP_HTML_Unsupported_Exception Raises unsupported exceptions for test reporting. + * @throws Error For unexpected "impossible" cases. + * * @param string|null $fragment_context Context element in which to parse HTML, such as BODY or SVG. * @param string $html Given test HTML. * @return string|null Tree structure of parsed HTML, if supported, else null. @@ -161,6 +164,7 @@ private static function build_tree_representation( ?string $fragment_context, st $processor = $fragment_context ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) : WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); } @@ -271,6 +275,7 @@ static function ( $a, $b ) { foreach ( $sorted_attributes as $attribute_name => $display_name ) { $val = $processor->get_attribute( $attribute_name ); + /* * Attributes with no value are `true` with the HTML API, * We map use the empty string value in the tree structure. 
From 8b00f8666053883db685352a0e3123c4266bf48b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 3 Jan 2025 08:59:23 +0100 Subject: [PATCH 6/9] Update 6.7.0 version to 6.8.0 --- .../html-api/class-wp-html-active-formatting-elements.php | 2 +- src/wp-includes/html-api/class-wp-html-processor.php | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 9b90d2d367d00..19d899cbed685 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -48,7 +48,7 @@ class WP_HTML_Active_Formatting_Elements { * * Do not use this method; it is meant to be used only by the HTML Processor. * - * @since 6.7.0 + * @since 6.8.0 * * @access private * diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 927dfe131d8a3..d54c2e495ca41 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -262,7 +262,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * These attributes are not modifiable. * - * @since 6.7.0 + * @since 6.8.0 * * @var array|null */ @@ -5884,7 +5884,7 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { * > been explicitly closed. * * @since 6.4.0 - * @since 6.7.0 Added additional support. + * @since 6.8.0 Added additional support. * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. 
* From 026c3c2300dd6a572dc6fb75cb78acd489521b3b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 3 Jan 2025 09:02:28 +0100 Subject: [PATCH 7/9] Improve actively_reconstructed_formatting_attributes type annotation --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index d54c2e495ca41..7bb8bdfc566d6 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -264,7 +264,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * @since 6.8.0 * - * @var array|null + * @var array */ protected $actively_reconstructed_formatting_attributes = array(); From 8e5a42393b9dcde835e2b84c5fbed4aee252165b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 3 Jan 2025 11:34:17 +0100 Subject: [PATCH 8/9] Ensure that get_qualified_attribute_name works with virtual tokens --- .../html-api/class-wp-html-processor.php | 19 +++++++++++++ .../html-api/class-wp-html-tag-processor.php | 27 ++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 7bb8bdfc566d6..d437888740545 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5306,6 +5306,25 @@ public function get_attribute( $name ) { return parent::get_attribute( $name ); } + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the current parsing context, whether HTML, SVG, or MathML. + * + * @since 6.8.0 Subclassed for the HTML Processor. + * + * @param string $attribute_name Which attribute name to adjust. + * + * @return string|null The qualified attribute name or null if not on matched tag. 
+ */ + public function get_qualified_attribute_name( $attribute_name ): ?string { + if ( $this->is_virtual() ) { + $namespace = $this->current_element->token->namespace; + return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); + } + + return parent::get_qualified_attribute_name( $attribute_name ); + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 604d3a9b0c98d..ef5d845142c11 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2979,22 +2979,37 @@ public function get_qualified_tag_name(): ?string { * * @since 6.7.0 * - * @param string $attribute_name Which attribute to adjust. - * @return string|null + * @param string $attribute_name Which attribute name to adjust. + * + * @return string|null The qualified attribute name or null if not on matched tag. */ public function get_qualified_attribute_name( $attribute_name ): ?string { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } + $namespace = $this->get_namespace(); + return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); + } - $namespace = $namespace_override ?? $this->get_namespace(); + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the provided namespace. + * + * @since 6.8.0 + * + * @param string $ns The namespace to use: 'html', 'svg', or 'math'. + * @param string $attribute_name Which attribute to adjust. + * + * @return string The qualified attribute name. 
+ */ + final protected static function lookup_qualified_attribute_name( string $ns, string $attribute_name ): string { $lower_name = strtolower( $attribute_name ); - if ( 'math' === $namespace && 'definitionurl' === $lower_name ) { + if ( 'math' === $ns && 'definitionurl' === $lower_name ) { return 'definitionURL'; } - if ( 'svg' === $this->get_namespace() ) { + if ( 'svg' === $ns ) { switch ( $lower_name ) { case 'attributename': return 'attributeName'; @@ -3172,7 +3187,7 @@ public function get_qualified_attribute_name( $attribute_name ): ?string { } } - if ( 'html' !== $namespace ) { + if ( 'html' !== $ns ) { switch ( $lower_name ) { case 'xlink:actuate': return 'xlink actuate'; From d2e5aab12315e1ec417347e023a10fae22d3f889 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 3 Jan 2025 12:58:20 +0100 Subject: [PATCH 9/9] Remove debugging code --- src/wp-includes/html-api/class-wp-html-processor.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index d437888740545..d82a3f483a98d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5947,7 +5947,6 @@ private function reconstruct_active_formatting_elements(): bool { */ rewind: if ( 1 === $currently_at ) { - echo "\e[90mJumping to create\e[m\n"; goto create; }