From bec7dc6a91b62c2d89ae1ec4bf44c0d65a783d0d Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 22 Aug 2023 11:51:02 -0600 Subject: [PATCH] HTML API: Allow extending input document for chunked processing. In some situations it may be useful to process an HTML document as it's being generated. This requires the ability to add chunks of text to the end of the input HTML. In this patch, the `WP_HTML_Tag_Processor::extend_input( $html )` method provides this ability. It is designed to allow for splitting an HTML document at arbitrary boundaries. --- .../html-api/class-wp-html-processor.php | 6 ++- .../html-api/class-wp-html-tag-processor.php | 11 +++++ .../tests/html-api/wpHtmlProcessor.php | 48 +++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6e1723494c2e9..bf1802e930bfe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -438,11 +438,13 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { $this->state->stack_of_open_elements->pop(); } - parent::next_tag( self::VISIT_EVERYTHING ); + $found_tag = parent::next_tag( self::VISIT_EVERYTHING ); + } else { + $found_tag = true; } // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( ! $found_tag ) { return false; } diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 2e84b3d7193a0..94f84c0fba915 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -326,6 +326,8 @@ class WP_HTML_Tag_Processor { */ private $bytes_already_parsed = 0; + private $previously_parsed = 0; + /** * Byte offset in input document where current tag name starts. 
* @@ -507,6 +509,14 @@ public function __construct( $html ) { $this->html = $html; } + public function extend_input( $html ) { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->bytes_already_parsed = $this->previously_parsed; + } + + $this->html .= $html; + } + /** * Finds the next tag matching the $query. * @@ -527,6 +537,7 @@ public function __construct( $html ) { public function next_tag( $query = null ) { $this->parse_query( $query ); $already_found = 0; + $this->previously_parsed = $this->bytes_already_parsed; do { if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 6a666c19cf27c..a32f0c189311a 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -115,4 +115,52 @@ public function test_fails_to_reconstruct_formatting_elements() { $this->assertTrue( $p->next_tag( 'EM' ), 'Could not find first EM.' ); $this->assertFalse( $p->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' ); } + + /** + * + * @ticket {TICKET NUMBER} + * + * @covers WP_HTML_Processor::extend_input + */ + public function test_continues_processing_after_extending_input() { + $p = WP_HTML_Processor::createFragment( '

This is' ); + + $p->next_tag( 'EM' ); + $this->assertFalse( $p->next_tag(), "Expected to reach end of document and pause, but found {$p->get_tag()} instad." ); + $this->assertNull( $p->get_last_error(), "Should not have encountered any errors, but found '{$p->get_last_error()}'." ); + + $p->extend_input( ' incomplete.

' ); + + // Find the strong. + $this->assertTrue( $p->next_tag() ); + $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'P', 'STRONG' ), $p->get_breadcrumbs() ); + + // Find the image. + $this->assertTrue( $p->next_tag() ); + $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'IMG' ), $p->get_breadcrumbs() ); + } + + /** + * + * @ticket {TICKET NUMBER} + * + * @covers WP_HTML_Processor::extend_input + */ + public function test_continues_processing_after_extending_truncated_input() { + $p = WP_HTML_Processor::createFragment( '

This is next_tag( 'EM' ); + $this->assertFalse( $p->next_tag(), "Expected to reach end of document and pause, but found {$p->get_tag()} instad." ); + $this->assertNull( $p->get_last_error(), "Should not have encountered any errors, but found '{$p->get_last_error()}'." ); + + $p->extend_input( 'ong>incomplete.

' ); + + // Find the strong. + $this->assertTrue( $p->next_tag() ); + $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'P', 'STRONG' ), $p->get_breadcrumbs() ); + + // Find the image. + $this->assertTrue( $p->next_tag() ); + $this->assertSame( array( 'HTML', 'BODY', 'DIV', 'IMG' ), $p->get_breadcrumbs() ); + } }