From 3bd8aaebbca14c0047ac0748b716ad10068a4ba2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 15 Feb 2024 08:55:26 -0700 Subject: [PATCH 1/7] HTML API: Defer applying attribute updates until necessary. When making repeated updates to a document, the Tag Processor will end up copying the entire document once for every update. This can lead to catastrophic behavior in the worse case. However, when batch-applying updates it's able to copy chunks of the document in one thread and only end up copying the entire document once for the entire batch. Previously the Tag Processor has been eagerly applying udpates, but in this patch it defers applying those updates as long as is possible. Co-authored-by: Bernie Reiter Co-authored-by: Jon Surrell --- .../html-api/class-wp-html-tag-processor.php | 29 +++++++++++- .../html-api/wpHtmlTagProcessor-bookmark.php | 11 ++++- .../tests/html-api/wpHtmlTagProcessor.php | 45 +++++++++++++++++++ 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index d5e43251af256..e662e7a36efcf 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -838,7 +838,7 @@ public function next_tag( $query = null ) { */ public function next_token() { $was_at = $this->bytes_already_parsed; - $this->get_updated_html(); + $this->after_tag(); // Don't proceed if there's nothing more to scan. if ( @@ -2041,6 +2041,31 @@ private function skip_whitespace() { * @since 6.2.0 */ private function after_tag() { + /* + * There could be lexical updates enqueued for an attribute that + * also exists on the next tag. In order to avoid conflating the + * attributes across the two tags, lexical updates with names + * need to be flushed to raw lexical updates. + */ + $this->class_name_updates_to_attributes_updates(); + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. + */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->apply_attributes_updates(); + break; + } + + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } + $this->token_starts_at = null; $this->token_length = null; $this->tag_name_starts_at = null; @@ -2230,7 +2255,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. - if ( $diff->start <= $this->bytes_already_parsed ) { + if ( $diff->start < $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php index a0a3b2aa44b4b..be4b6398751af 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php @@ -293,21 +293,30 @@ public function test_bookmarks_complex_use_case() { /** * @ticket 56299 + * @ticket 60697 Added additional assertion to verify that a requested update was properly applied. * * @covers WP_HTML_Tag_Processor::seek */ public function test_updates_bookmark_for_additions_after_both_sides() { $processor = new WP_HTML_Tag_Processor( '
First
Second
' ); $processor->next_tag(); + $processor->set_attribute( 'id', 'one' ); $processor->set_bookmark( 'first' ); $processor->next_tag(); + $processor->set_attribute( 'id', 'two' ); $processor->add_class( 'second' ); $processor->seek( 'first' ); $processor->add_class( 'first' ); $this->assertSame( - '
First
Second
', + 'one', + $processor->get_attribute( 'id' ), + 'Should have remembered attribute change from before the seek.' + ); + + $this->assertSame( + '
First
Second
', $processor->get_updated_html(), 'The bookmark was updated incorrectly in response to HTML markup updates' ); diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index 5375a2fca0ebf..824630b33516a 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -2727,4 +2727,49 @@ public function test_single_text_node_with_taglike_text() { $this->assertSame( '#text', $processor->get_token_type(), 'Did not find text node.' ); $this->assertSame( 'test< /A>', $processor->get_modifiable_text(), 'Did not find complete text node.' ); } + + /** + * Ensures that updates which are enqueued in front of the cursor + * are applied before moving forward in the document. + * + * @ticket 60697 + */ + public function test_applies_updates_before_proceeding() { + $html = '
'; + + $subclass = new class( $html ) extends WP_HTML_Tag_Processor { + /** + * Inserts raw text after the current token. + * + * @param string $new_html Raw text to insert. + */ + public function insert_after( $new_html ) { + $this->set_bookmark( 'here' ); + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->bookmarks['here']->start + $this->bookmarks['here']->length + 1, + 0, + $new_html + ); + } + }; + + $subclass->next_tag( 'img' ); + $subclass->insert_after( '

snow-capped

' ); + + $subclass->next_tag(); + $this->assertSame( + 'P', + $subclass->get_tag(), + 'Should have matched inserted HTML as next tag.' + ); + + $subclass->next_tag( 'img' ); + $subclass->set_attribute( 'alt', 'mountain' ); + + $this->assertSame( + '

snow-capped

mountain
', + $subclass->get_updated_html(), + 'Should have properly applied the update from in front of the cursor.' + ); + } } From faf42382c1500dc71fc6114b94e34ae535e5ffe6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 8 Mar 2024 17:25:32 -0700 Subject: [PATCH 2/7] Call `get_updated_html()` to ensure proper application of updates. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e662e7a36efcf..378a0879a6f2d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2054,7 +2054,7 @@ private function after_tag() { * before proceeding, otherwise they may be overlooked. */ if ( $update->start >= $this->bytes_already_parsed ) { - $this->apply_attributes_updates(); + $this->get_updated_html(); break; } From c93e8bad4aa67bdd7994599380b4ab958e1436d6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 8 Mar 2024 17:29:28 -0700 Subject: [PATCH 3/7] Split next_token into an internal and public function. --- .../html-api/class-wp-html-tag-processor.php | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 378a0879a6f2d..f4bef9d7be0d6 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -837,6 +837,10 @@ public function next_tag( $query = null ) { * @return bool Whether a token was parsed. */ public function next_token() { + return $this->internal_next_token(); + } + + private function internal_next_token() { $was_at = $this->bytes_already_parsed; $this->after_tag(); @@ -3189,15 +3193,7 @@ public function get_updated_html() { * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; - $this->parse_next_tag(); - // Reparse the attributes. - while ( $this->parse_next_attribute() ) { - continue; - } - - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - $this->token_length = $tag_ends_at - $this->token_starts_at; - $this->bytes_already_parsed = $tag_ends_at; + $this->internal_next_token(); return $this->html; } From 51afe4c8404d0a92790020ec699441eb49ce0602 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 8 Mar 2024 18:25:36 -0700 Subject: [PATCH 4/7] Rename internal method and add explanatory documentation. --- .../html-api/class-wp-html-tag-processor.php | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index f4bef9d7be0d6..9ad7c38c5c0cf 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -837,10 +837,25 @@ public function next_tag( $query = null ) { * @return bool Whether a token was parsed. */ public function next_token() { - return $this->internal_next_token(); + return $this->base_class_next_token(); } - private function internal_next_token() { + /** + * Internal method which finds the next token in the HTML document. + * + * This method is a protected internal function which implements the logic for + * finding the next token in a document. It's called during `get_updated_html()` + * so that the parser can update its state after applying updates, but without + * affecting the location of the cursor in the document or erroneously triggering + * higher-level logic in subclasses, such as in the HTML Processor. + * + * @since 6.5.0 + * + * @access private + * + * @return bool Whether a token was parsed. + */ + private function base_class_next_token() { $was_at = $this->bytes_already_parsed; $this->after_tag(); @@ -3193,7 +3208,7 @@ public function get_updated_html() { * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; - $this->internal_next_token(); + $this->base_class_next_token(); return $this->html; } From 805ea9a7d2e31a070ed895d37015209d09b29d62 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Mar 2024 08:18:23 -0700 Subject: [PATCH 5/7] Flush updates once reaching 1000 enqueued changes. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 9ad7c38c5c0cf..5c14b8d8484a7 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2067,6 +2067,9 @@ private function after_tag() { * need to be flushed to raw lexical updates. */ $this->class_name_updates_to_attributes_updates(); + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_html(); + } foreach ( $this->lexical_updates as $name => $update ) { /* * Any updates appearing after the cursor should be applied From 3b9b322cbebb03b195d9a81b95f116c7094f532d Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Mar 2024 16:07:29 -0700 Subject: [PATCH 6/7] Review feedback --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 8 ++++---- .../tests/html-api/wpHtmlTagProcessor-bookmark.php | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 5c14b8d8484a7..3d583cae56633 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -844,10 +844,10 @@ public function next_token() { * Internal method which finds the next token in the HTML document. * * This method is a protected internal function which implements the logic for - * finding the next token in a document. It's called during `get_updated_html()` - * so that the parser can update its state after applying updates, but without - * affecting the location of the cursor in the document or erroneously triggering - * higher-level logic in subclasses, such as in the HTML Processor. + * finding the next token in a document. It exists so that the parser can update + * its state without affecting the location of the cursor in the document and + * without triggering subclass methods for things like `next_token()`, e.g. when + * applying patches before searching for the next token. * * @since 6.5.0 * diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php index be4b6398751af..0c5093d03cc40 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php @@ -293,7 +293,7 @@ public function test_bookmarks_complex_use_case() { /** * @ticket 56299 - * @ticket 60697 Added additional assertion to verify that a requested update was properly applied. + * @ticket 60697 * * @covers WP_HTML_Tag_Processor::seek */ From 0ec643683d111420b0711f441c397de881bc4cfc Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Mar 2024 16:12:29 -0700 Subject: [PATCH 7/7] Expand explanation. Co-authored-by: Weston Ruter --- .../html-api/class-wp-html-tag-processor.php | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 3d583cae56633..c540ea96c111e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2067,9 +2067,20 @@ private function after_tag() { * need to be flushed to raw lexical updates. */ $this->class_name_updates_to_attributes_updates(); + + /* + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practially-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. + */ if ( 1000 < count( $this->lexical_updates ) ) { $this->get_updated_html(); } + foreach ( $this->lexical_updates as $name => $update ) { /* * Any updates appearing after the cursor should be applied