Skip to content

Commit

Permalink
HTML API: Optimize low-level parsing details in Tag Processor.
Browse files Browse the repository at this point in the history
Introduces a number of micro-level optimizations in the Tag Processor to
improve token-scanning performance. Should contain no functional changes.

Based on benchmarking against a list of the 100 most-visited websites,
these changes improve the Tag Processor's tag-scanning performance by an
average of between 3.5% and 7.5%.

Developed in #6890
Discussed in https://core.trac.wordpress.org/ticket/61545

Follow-up to [55203].

See #61545.


git-svn-id: https://develop.svn.wordpress.org/trunk@58613 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Jul 1, 2024
1 parent 80409a2 commit cf064ef
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 92 deletions.
26 changes: 13 additions & 13 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public static function decode( $context, $text ) {

while ( $at < $end ) {
$next_character_reference_at = strpos( $text, '&', $at );
if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
if ( false === $next_character_reference_at ) {
break;
}

Expand Down Expand Up @@ -436,26 +436,26 @@ public static function code_point_to_utf8_bytes( $code_point ) {
}

if ( $code_point <= 0x7FF ) {
$byte1 = ( $code_point >> 6 ) | 0xC0;
$byte2 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
$byte2 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CC', $byte1, $byte2 );
return "{$byte1}{$byte2}";
}

if ( $code_point <= 0xFFFF ) {
$byte1 = ( $code_point >> 12 ) | 0xE0;
$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte3 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte3 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CCC', $byte1, $byte2, $byte3 );
return "{$byte1}{$byte2}{$byte3}";
}

// Any values above U+10FFFF are eliminated above in the pre-check.
$byte1 = ( $code_point >> 18 ) | 0xF0;
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte4 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte4 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
return "{$byte1}{$byte2}{$byte3}{$byte4}";
}
}
118 changes: 39 additions & 79 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1524,21 +1524,10 @@ private function parse_next_tag() {
$was_at = $this->bytes_already_parsed;
$at = $was_at;

while ( false !== $at && $at < $doc_length ) {
while ( $at < $doc_length ) {
$at = strpos( $html, '<', $at );

/*
* This does not imply an incomplete parse; it indicates that there
* can be nothing left in the document other than a #text node.
*/
if ( false === $at ) {
$this->parser_state = self::STATE_TEXT_NODE;
$this->token_starts_at = $was_at;
$this->token_length = strlen( $html ) - $was_at;
$this->text_starts_at = $was_at;
$this->text_length = $this->token_length;
$this->bytes_already_parsed = strlen( $html );
return true;
break;
}

if ( $at > $was_at ) {
Expand All @@ -1554,19 +1543,9 @@ private function parse_next_tag() {
*
* @see https://html.spec.whatwg.org/#tag-open-state
*/
if ( strlen( $html ) > $at + 1 ) {
$next_character = $html[ $at + 1 ];
$at_another_node = (
'!' === $next_character ||
'/' === $next_character ||
'?' === $next_character ||
( 'A' <= $next_character && $next_character <= 'Z' ) ||
( 'a' <= $next_character && $next_character <= 'z' )
);
if ( ! $at_another_node ) {
++$at;
continue;
}
/*
 * A `<` only opens a new token when the next byte is `!`, `/`, `?`,
 * or an ASCII letter; otherwise it is plain text and scanning resumes
 * after it. The mask must list every letter a-z and A-Z: omitting one
 * (previously `D`) makes tags such as `<DIV>` be treated as text.
 */
if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
	++$at;
	continue;
}

$this->parser_state = self::STATE_TEXT_NODE;
Expand Down Expand Up @@ -1630,11 +1609,7 @@ private function parse_next_tag() {
* `<!--` transitions to a comment state – apply further comment rules.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
$doc_length > $at + 3 &&
'-' === $html[ $at + 2 ] &&
'-' === $html[ $at + 3 ]
) {
if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
$closer_at = $at + 4;
// If it's not possible to close the comment then there is nothing more to scan.
if ( $doc_length <= $closer_at ) {
Expand Down Expand Up @@ -1911,7 +1886,17 @@ private function parse_next_tag() {
++$at;
}

return false;
/*
* This does not imply an incomplete parse; it indicates that there
* can be nothing left in the document other than a #text node.
*/
$this->parser_state = self::STATE_TEXT_NODE;
$this->token_starts_at = $was_at;
$this->token_length = $doc_length - $was_at;
$this->text_starts_at = $was_at;
$this->text_length = $this->token_length;
$this->bytes_already_parsed = $doc_length;
return true;
}

/**
Expand All @@ -1922,9 +1907,11 @@ private function parse_next_tag() {
* @return bool Whether an attribute was found before the end of the document.
*/
private function parse_next_attribute() {
$doc_length = strlen( $this->html );

// Skip whitespace and slashes.
$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1941,21 +1928,21 @@ private function parse_next_attribute() {
: strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );

// No attribute, just tag closer.
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
return false;
}

$attribute_start = $this->bytes_already_parsed;
$attribute_name = substr( $this->html, $attribute_start, $name_length );
$this->bytes_already_parsed += $name_length;
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}

$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1965,7 +1952,7 @@ private function parse_next_attribute() {
if ( $has_value ) {
++$this->bytes_already_parsed;
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1976,8 +1963,10 @@ private function parse_next_attribute() {
case '"':
$quote = $this->html[ $this->bytes_already_parsed ];
$value_start = $this->bytes_already_parsed + 1;
$value_length = strcspn( $this->html, $quote, $value_start );
$attribute_end = $value_start + $value_length + 1;
$end_quote_at = strpos( $this->html, $quote, $value_start );
$end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at;
$value_length = $end_quote_at - $value_start;
$attribute_end = $end_quote_at + 1;
$this->bytes_already_parsed = $attribute_end;
break;

Expand All @@ -1993,7 +1982,7 @@ private function parse_next_attribute() {
$attribute_end = $attribute_start + $name_length;
}

if ( $attribute_end >= strlen( $this->html ) ) {
if ( $attribute_end >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -2014,7 +2003,7 @@ private function parse_next_attribute() {
$comparable_name = strtolower( $attribute_name );

// If an attribute is listed many times, only use the first declaration and ignore the rest.
if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
$attribute_name,
$value_start,
Expand All @@ -2038,7 +2027,7 @@ private function parse_next_attribute() {
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
if ( null === $this->duplicate_attributes ) {
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
} elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
} else {
$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
Expand Down Expand Up @@ -3110,14 +3099,12 @@ public function remove_attribute( $name ) {
);

// Removes any duplicated attributes if they were also present.
if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->length,
''
);
}
foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->length,
''
);
}

return true;
Expand Down Expand Up @@ -3317,35 +3304,8 @@ private function matches() {
}

// Does the tag name match the requested tag name in a case-insensitive manner?
if ( null !== $this->sought_tag_name ) {
/*
* String (byte) length lookup is fast. If they aren't the
* same length then they can't be the same string values.
*/
if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
return false;
}

/*
* Check each character to determine if they are the same.
* Defer calls to `strtoupper()` to avoid them when possible.
* Calling `strcasecmp()` here tested slower than comparing each
* character, so unless benchmarks show otherwise, it should
* not be used.
*
* It's expected that most of the time that this runs, a
* lower-case tag name will be supplied and the input will
* contain lower-case tag names, thus normally bypassing
* the case comparison code.
*/
for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
$html_char = $this->html[ $this->tag_name_starts_at + $i ];
$tag_char = $this->sought_tag_name[ $i ];

if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
return false;
}
}
/*
 * `substr_compare()` with an explicit length only examines the first
 * `tag_name_length` bytes of each string, so a sought name that merely
 * begins with the tag name (e.g. "DIVX" against `<div>`) would compare
 * as equal. Require identical byte lengths before comparing content;
 * the length check is also the cheapest way to reject most mismatches.
 */
if (
	isset( $this->sought_tag_name ) &&
	(
		strlen( $this->sought_tag_name ) !== $this->tag_name_length ||
		0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true )
	)
) {
	return false;
}

if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
Expand Down

0 comments on commit cf064ef

Please sign in to comment.