From 279e43d33a7068993fd1362e5373b29cec17c28f Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 30 Nov 2023 13:47:56 -0600 Subject: [PATCH] HTML API: Track spans of text with (offset, length) instead of (start, end) This patch follows up on earlier design questions around how to represent spans of strings inside the class. It's relevant now as preparation for #5683. The mixture of (offset, length) and (start, end) coordinates becomes confusing at times and all final string operations are performed with the (offset, length) pair, since these feed into `substr()`. In preparation for exposing all tokens within an HTML document this change: - Unifies the representation throughout the class. - Creates `token_starts_at` to track the start of the current token. - Replaces `tag_ends_at` with `token_length` for re-use with other token types. There should be no functional or behavioral changes in this patch. For the internal helper classes this patch introduces breaking changes, but those classes are marked private and should not be used outside of the HTML API itself. --- .../class-wp-html-attribute-token.php | 38 +++++- .../html-api/class-wp-html-span.php | 19 +-- .../html-api/class-wp-html-tag-processor.php | 119 +++++++++++------- .../class-wp-html-text-replacement.php | 21 ++-- 4 files changed, 130 insertions(+), 67 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-attribute-token.php b/src/wp-includes/html-api/class-wp-html-attribute-token.php index f938609e41687..74d41320b1c79 100644 --- a/src/wp-includes/html-api/class-wp-html-attribute-token.php +++ b/src/wp-includes/html-api/class-wp-html-attribute-token.php @@ -15,6 +15,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -23,6 +24,7 @@ class WP_HTML_Attribute_Token { * Attribute name. 
* * @since 6.2.0 + * * @var string */ public $name; @@ -31,6 +33,7 @@ class WP_HTML_Attribute_Token { * Attribute value. * * @since 6.2.0 + * * @var int */ public $value_starts_at; @@ -39,6 +42,7 @@ class WP_HTML_Attribute_Token { * How many bytes the value occupies in the input HTML. * * @since 6.2.0 + * * @var int */ public $value_length; @@ -47,22 +51,43 @@ class WP_HTML_Attribute_Token { * The string offset where the attribute name starts. * * @since 6.2.0 + * * @var int */ public $start; /** - * The string offset after the attribute value or its name. + * Byte length of text spanning the attribute inside a tag. + * + * This span starts at the first character of the attribute name + * and it ends after one of three cases: + * + * - at the end of the attribute name for boolean attributes. + * - at the end of the value for unquoted attributes. + * - at the final single or double quote for quoted attributes. + * + * Example: + * + * <div class="post">
+ * ------------ length is 12, including quotes + * + * <input type="checked" checked id="selector"> + * ------- length is 7 + * + * <a rel=noopener> + * ------------ length is 12 + * + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * - * @since 6.2.0 * @var int */ - public $end; + public $length; /** * Whether the attribute is a boolean attribute with value `true`. * * @since 6.2.0 + * * @var bool */ public $is_true; @@ -71,20 +96,21 @@ class WP_HTML_Attribute_Token { * Constructor. * * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * * @param string $name Attribute name. * @param int $value_start Attribute value. * @param int $value_length Number of bytes attribute value spans. * @param int $start The string offset where the attribute name starts. - * @param int $end The string offset after the attribute value or its name. + * @param int $length Byte length of the entire attribute name or name and value pair expression. * @param bool $is_true Whether the attribute is a boolean attribute with true value. */ - public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) { + public function __construct( $name, $value_start, $value_length, $start, $length, $is_true ) { $this->name = $name; $this->value_starts_at = $value_start; $this->value_length = $value_length; $this->start = $start; - $this->end = $end; + $this->length = $length; $this->is_true = $is_true; } } diff --git a/src/wp-includes/html-api/class-wp-html-span.php b/src/wp-includes/html-api/class-wp-html-span.php index 46227ebd02997..b1ab865af3bed 100644 --- a/src/wp-includes/html-api/class-wp-html-span.php +++ b/src/wp-includes/html-api/class-wp-html-span.php @@ -18,6 +18,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -26,28 +27,30 @@ class WP_HTML_Span { * Byte offset into document where span begins. 
* * @since 6.2.0 + * * @var int */ public $start; /** - * Byte offset into document where span ends. + * Byte length of this span. + * + * @since 6.5.0 * - * @since 6.2.0 * @var int */ - public $end; + public $length; /** * Constructor. * * @since 6.2.0 * - * @param int $start Byte offset into document where replacement span begins. - * @param int $end Byte offset into document where replacement span ends. + * @param int $start Byte offset into document where replacement span begins. + * @param int $length Byte length of span. */ - public function __construct( $start, $end ) { - $this->start = $start; - $this->end = $end; + public function __construct( $start, $length ) { + $this->start = $start; + $this->length = $length; } } diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index ea74410230954..17b3f400fcea6 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -329,47 +329,68 @@ class WP_HTML_Tag_Processor { private $bytes_already_parsed = 0; /** - * Byte offset in input document where current tag name starts. + * Byte offset in input document where current token starts. * * Example: * *
... * 01234 - * - tag name starts at 1 + * - token starts at 0 + * + * @since 6.5.0 * - * @since 6.2.0 * @var int|null */ - private $tag_name_starts_at; + private $token_starts_at; /** - * Byte length of current tag name. + * Byte length of current token. + * + * Example: + * + *
... + * 012345678901234 + * - token length is 14 - 0 = 14 + * + * a is a token. + * 0123456789 123456789 123456789 + * - token length is 17 - 2 = 15 + * + * @since 6.5.0 + * + * @var int|null + */ + private $token_length; + + /** + * Byte offset in input document where current tag name starts. * * Example: * *
... * 01234 - * --- tag name length is 3 + * - tag name starts at 1 * * @since 6.2.0 + * * @var int|null */ - private $tag_name_length; + private $tag_name_starts_at; /** - * Byte offset in input document where current tag token ends. + * Byte length of current tag name. * * Example: * *
... - * 0 1 | - * 01234567890123456 - * --- tag name ends at 14 + * 01234 + * --- tag name length is 3 * * @since 6.2.0 + * * @var int|null */ - private $tag_ends_at; + private $tag_name_length; /** * Whether the current tag is an opening tag, e.g.
<div>, or a closing tag, e.g. </div>
. @@ -388,14 +409,14 @@ class WP_HTML_Tag_Processor { * //
* // ^ parsing will continue from this point. * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ) * ); * * // When picking up parsing again, or when asking to find the * // `class` attribute we will continue and add to this array. * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), - * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ), + * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false ) * ); * * // Note that only the `class` attribute value is stored in the index. @@ -484,9 +505,9 @@ class WP_HTML_Tag_Processor { * * // Replace an attribute stored with a new value, indices * // sourced from the lazily-parsed HTML recognizer. - * $start = $attributes['src']->start; - * $end = $attributes['src']->end; - * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value ); + * $start = $attributes['src']->start; + * $length = $attributes['src']->length; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value ); * * // Correspondingly, something like this will appear in this array. * $lexical_updates = array( @@ -566,7 +587,7 @@ public function next_tag( $query = null ) { if ( false === $tag_ends_at ) { return false; } - $this->tag_ends_at = $tag_ends_at; + $this->token_length = $tag_ends_at - $this->token_starts_at; $this->bytes_already_parsed = $tag_ends_at; // Finally, check if the parsed tag and its attributes match the search query. @@ -808,10 +829,7 @@ public function set_bookmark( $name ) { return false; } - $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - ( $this->is_closing_tag ? 
2 : 1 ), - $this->tag_ends_at - ); + $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); return true; } @@ -875,7 +893,7 @@ private function skip_rcdata( $tag_name ) { while ( false !== $at && $at < $doc_length ) { $at = strpos( $this->html, '= $doc_length ) { $this->bytes_already_parsed = $doc_length; return false; @@ -1093,6 +1111,8 @@ private function parse_next_tag() { return false; } + $this->token_starts_at = $at; + if ( '/' === $this->html[ $at + 1 ] ) { $this->is_closing_tag = true; ++$at; @@ -1381,7 +1401,7 @@ private function parse_next_attribute() { $value_start, $value_length, $attribute_start, - $attribute_end, + $attribute_end - $attribute_start, ! $has_value ); @@ -1396,7 +1416,7 @@ private function parse_next_attribute() { * an array when encountering duplicates avoids needless allocations in the * normative case of parsing tags with no duplicate attributes. */ - $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end ); + $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start ); if ( null === $this->duplicate_attributes ) { $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); } elseif ( ! 
array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { @@ -1424,9 +1444,10 @@ private function skip_whitespace() { */ private function after_tag() { $this->get_updated_html(); + $this->token_starts_at = null; + $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; - $this->tag_ends_at = null; $this->is_closing_tag = null; $this->attributes = array(); $this->duplicate_attributes = null; @@ -1606,7 +1627,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { $bytes_already_copied = 0; $output_buffer = ''; foreach ( $this->lexical_updates as $diff ) { - $shift = strlen( $diff->text ) - ( $diff->end - $diff->start ); + $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. if ( $diff->start <= $this->bytes_already_parsed ) { @@ -1620,7 +1641,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); $output_buffer .= $diff->text; - $bytes_already_copied = $diff->end; + $bytes_already_copied = $diff->start + $diff->length; } $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); @@ -1630,6 +1651,8 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { + $bookmark_end = $bookmark->start + $bookmark->length; + /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. 
Loop through each change @@ -1640,28 +1663,30 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { - if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { + $diff_end = $diff->start + $diff->length; + + if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { break; } - if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { $this->release_bookmark( $bookmark_name ); continue 2; } - $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); + $delta = strlen( $diff->text ) - $diff->length; if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } - if ( $bookmark->end >= $diff->end ) { + if ( $bookmark_end >= $diff_end ) { $tail_delta += $delta; } } - $bookmark->start += $head_delta; - $bookmark->end += $tail_delta; + $bookmark->start += $head_delta; + $bookmark->length += $tail_delta - $head_delta; } $this->lexical_updates = array(); @@ -1743,7 +1768,7 @@ private static function sort_start_ascending( $a, $b ) { * This code should be unreachable, because it implies the two replacements * start at the same location and contain the same text. */ - return $a->end - $b->end; + return $a->length - $b->length; } /** @@ -1971,7 +1996,15 @@ public function has_self_closing_flag() { return false; } - return '/' === $this->html[ $this->tag_ends_at - 1 ]; + /* + * The self-closing flag is the solidus at the _end_ of the tag, not the beginning. + * + * Example: + * + *
+ * ^ this appears one character before the end of the closing ">". + */ + return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ]; } /** @@ -2101,7 +2134,7 @@ public function set_attribute( $name, $value ) { $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, - $existing_attribute->end, + $existing_attribute->length, $updated_attribute ); } else { @@ -2119,7 +2152,7 @@ public function set_attribute( $name, $value ) { */ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, - $this->tag_name_starts_at + $this->tag_name_length, + 0, ' ' . $updated_attribute ); } @@ -2194,7 +2227,7 @@ public function remove_attribute( $name ) { */ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, - $this->attributes[ $name ]->end, + $this->attributes[ $name ]->length, '' ); @@ -2203,7 +2236,7 @@ public function remove_attribute( $name ) { foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, - $attribute_token->end, + $attribute_token->length, '' ); } @@ -2289,7 +2322,7 @@ public function get_updated_html() { * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ - $before_current_tag = $this->tag_name_starts_at - 1; + $before_current_tag = $this->token_starts_at; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. 
@@ -2325,7 +2358,7 @@ public function get_updated_html() { } $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - $this->tag_ends_at = $tag_ends_at; + $this->token_length = $tag_ends_at - $this->token_starts_at; $this->bytes_already_parsed = $tag_ends_at; return $this->html; diff --git a/src/wp-includes/html-api/class-wp-html-text-replacement.php b/src/wp-includes/html-api/class-wp-html-text-replacement.php index 26b7bb2d28630..d66eb2aebacd8 100644 --- a/src/wp-includes/html-api/class-wp-html-text-replacement.php +++ b/src/wp-includes/html-api/class-wp-html-text-replacement.php @@ -15,6 +15,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -28,12 +29,12 @@ class WP_HTML_Text_Replacement { public $start; /** - * Byte offset into document where replacement span ends. + * Byte length of span being replaced. * - * @since 6.2.0 + * @since 6.5.0 * @var int */ - public $end; + public $length; /** * Span of text to insert in document to replace existing content from start to end. @@ -48,13 +49,13 @@ class WP_HTML_Text_Replacement { * * @since 6.2.0 * - * @param int $start Byte offset into document where replacement span begins. - * @param int $end Byte offset into document where replacement span ends. - * @param string $text Span of text to insert in document to replace existing content from start to end. + * @param int $start Byte offset into document where replacement span begins. + * @param int $length Byte length of span in document being replaced. + * @param string $text Span of text to insert in document to replace existing content from start to end. */ - public function __construct( $start, $end, $text ) { - $this->start = $start; - $this->end = $end; - $this->text = $text; + public function __construct( $start, $length, $text ) { + $this->start = $start; + $this->length = $length; + $this->text = $text; } }