Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML API: Optimize low-level parsing details. #6890

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public static function decode( $context, $text ) {

while ( $at < $end ) {
$next_character_reference_at = strpos( $text, '&', $at );
if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
if ( false === $next_character_reference_at ) {
break;
}

Expand Down Expand Up @@ -436,26 +436,26 @@ public static function code_point_to_utf8_bytes( $code_point ) {
}

if ( $code_point <= 0x7FF ) {
$byte1 = ( $code_point >> 6 ) | 0xC0;
$byte2 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
$byte2 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CC', $byte1, $byte2 );
return "{$byte1}{$byte2}";
}

if ( $code_point <= 0xFFFF ) {
$byte1 = ( $code_point >> 12 ) | 0xE0;
$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte3 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte3 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CCC', $byte1, $byte2, $byte3 );
return "{$byte1}{$byte2}{$byte3}";
}

// Any values above U+10FFFF are eliminated above in the pre-check.
$byte1 = ( $code_point >> 18 ) | 0xF0;
$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
$byte4 = $code_point & 0x3F | 0x80;
$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
$byte4 = chr( $code_point & 0x3F | 0x80 );

return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
return "{$byte1}{$byte2}{$byte3}{$byte4}";
}
}
118 changes: 39 additions & 79 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1524,21 +1524,10 @@ private function parse_next_tag() {
$was_at = $this->bytes_already_parsed;
$at = $was_at;

while ( false !== $at && $at < $doc_length ) {
while ( $at < $doc_length ) {
$at = strpos( $html, '<', $at );

/*
* This does not imply an incomplete parse; it indicates that there
* can be nothing left in the document other than a #text node.
*/
if ( false === $at ) {
$this->parser_state = self::STATE_TEXT_NODE;
$this->token_starts_at = $was_at;
$this->token_length = strlen( $html ) - $was_at;
$this->text_starts_at = $was_at;
$this->text_length = $this->token_length;
$this->bytes_already_parsed = strlen( $html );
return true;
break;
}

if ( $at > $was_at ) {
Expand All @@ -1554,19 +1543,9 @@ private function parse_next_tag() {
*
* @see https://html.spec.whatwg.org/#tag-open-state
*/
if ( strlen( $html ) > $at + 1 ) {
$next_character = $html[ $at + 1 ];
$at_another_node = (
'!' === $next_character ||
'/' === $next_character ||
'?' === $next_character ||
( 'A' <= $next_character && $next_character <= 'Z' ) ||
( 'a' <= $next_character && $next_character <= 'z' )
);
if ( ! $at_another_node ) {
++$at;
continue;
}
if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @adamziel for pointing out to me that strspn() and strcspn() accept a $length parameter!

++$at;
continue;
}

$this->parser_state = self::STATE_TEXT_NODE;
Expand Down Expand Up @@ -1630,11 +1609,7 @@ private function parse_next_tag() {
* `<!--` transitions to a comment state – apply further comment rules.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
$doc_length > $at + 3 &&
'-' === $html[ $at + 2 ] &&
'-' === $html[ $at + 3 ]
) {
if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
$closer_at = $at + 4;
// If it's not possible to close the comment then there is nothing more to scan.
if ( $doc_length <= $closer_at ) {
Expand Down Expand Up @@ -1911,7 +1886,17 @@ private function parse_next_tag() {
++$at;
}

return false;
/*
* This does not imply an incomplete parse; it indicates that there
* can be nothing left in the document other than a #text node.
*/
$this->parser_state = self::STATE_TEXT_NODE;
$this->token_starts_at = $was_at;
$this->token_length = $doc_length - $was_at;
$this->text_starts_at = $was_at;
$this->text_length = $this->token_length;
$this->bytes_already_parsed = $doc_length;
return true;
}

/**
Expand All @@ -1922,9 +1907,11 @@ private function parse_next_tag() {
* @return bool Whether an attribute was found before the end of the document.
*/
private function parse_next_attribute() {
$doc_length = strlen( $this->html );

// Skip whitespace and slashes.
$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1941,21 +1928,21 @@ private function parse_next_attribute() {
: strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );

// No attribute, just tag closer.
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
return false;
}

$attribute_start = $this->bytes_already_parsed;
$attribute_name = substr( $this->html, $attribute_start, $name_length );
$this->bytes_already_parsed += $name_length;
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}

$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1965,7 +1952,7 @@ private function parse_next_attribute() {
if ( $has_value ) {
++$this->bytes_already_parsed;
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -1976,8 +1963,10 @@ private function parse_next_attribute() {
case '"':
$quote = $this->html[ $this->bytes_already_parsed ];
$value_start = $this->bytes_already_parsed + 1;
$value_length = strcspn( $this->html, $quote, $value_start );
$attribute_end = $value_start + $value_length + 1;
$end_quote_at = strpos( $this->html, $quote, $value_start );
$end_quote_at = false === $end_quote_at ? $doc_length : $end_quote_at;
$value_length = $end_quote_at - $value_start;
$attribute_end = $end_quote_at + 1;
$this->bytes_already_parsed = $attribute_end;
break;

Expand All @@ -1993,7 +1982,7 @@ private function parse_next_attribute() {
$attribute_end = $attribute_start + $name_length;
}

if ( $attribute_end >= strlen( $this->html ) ) {
if ( $attribute_end >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
Expand All @@ -2014,7 +2003,7 @@ private function parse_next_attribute() {
$comparable_name = strtolower( $attribute_name );

// If an attribute is listed many times, only use the first declaration and ignore the rest.
if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
$attribute_name,
$value_start,
Expand All @@ -2038,7 +2027,7 @@ private function parse_next_attribute() {
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
if ( null === $this->duplicate_attributes ) {
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
} elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
} else {
$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
Expand Down Expand Up @@ -3110,14 +3099,12 @@ public function remove_attribute( $name ) {
);

// Removes any duplicated attributes if they were also present.
if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->length,
''
);
}
foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->length,
''
);
}

return true;
Expand Down Expand Up @@ -3317,35 +3304,8 @@ private function matches() {
}

// Does the tag name match the requested tag name in a case-insensitive manner?
if ( null !== $this->sought_tag_name ) {
/*
* String (byte) length lookup is fast. If they aren't the
* same length then they can't be the same string values.
*/
if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
return false;
}

/*
* Check each character to determine if they are the same.
* Defer calls to `strtoupper()` to avoid them when possible.
* Calling `strcasecmp()` here tested slower than comparing each
* character, so unless benchmarks show otherwise, it should
* not be used.
*
* It's expected that most of the time that this runs, a
* lower-case tag name will be supplied and the input will
* contain lower-case tag names, thus normally bypassing
* the case comparison code.
*/
for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
$html_char = $this->html[ $this->tag_name_starts_at + $i ];
$tag_char = $this->sought_tag_name[ $i ];

if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
return false;
}
}
if ( isset( $this->sought_tag_name ) && 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) {
return false;
}

if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
Expand Down
Loading