Skip to content

Commit

Permalink
HTML API: Backport updates from Core
Browse files Browse the repository at this point in the history
Pulling over work on the HTML API from Core:

 - bug fix: stop parsing tags inside of STYLE, IFRAME, and other RAWTEXT elements.
 - bug fix: stay on track when document ends in the middle of processing a tag.
 - bug fix: remove all copies of an attribute when duplicates exist on a tag.
 - bug fix: stop processing in the HTML Processor when encountering unsupported markup.
 - reverts changes made in #53866 which introduced a bug during a styling refactor.
 - update docblock comments throughout the HTML API.

This patch also excludes the HTML API folders in the compatibility
layer from WPCS inspection, since this code is maintained, reviewed,
and accepted into Core and brought back into Gutenberg. Changes to
the code need to be made first in Core and brought over as blessed
backport changes.

Co-authored-by: Ari Stathopoulos <aristath@gmail.com>
Co-authored-by: Bernie Reiter <96308+ockham@users.noreply.github.com>
  • Loading branch information
3 people committed Sep 27, 2023
1 parent e082f02 commit e1a88b3
Show file tree
Hide file tree
Showing 5 changed files with 2,626 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@
* unquoted values will appear in the output with double-quotes.
*
* @since 6.2.0
* @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
* @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
*/
class Gutenberg_HTML_Tag_Processor_6_3 {
/**
Expand Down Expand Up @@ -404,6 +406,16 @@ class Gutenberg_HTML_Tag_Processor_6_3 {
*/
private $attributes = array();

/**
* Tracks spans of duplicate attributes on a given tag, used for removing
* all copies of an attribute when calling `remove_attribute()`.
*
* @since 6.3.2
*
* @var (WP_HTML_Span[])[]|null
*/
private $duplicate_attributes = null;

/**
* Which class names to add or remove from a tag.
*
Expand Down Expand Up @@ -546,6 +558,10 @@ public function next_tag( $query = null ) {
}

// Ensure that the tag closes before the end of the document.
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
return false;
}

$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
return false;
Expand All @@ -564,7 +580,14 @@ public function next_tag( $query = null ) {
* of the tag name as a pre-check avoids a string allocation when it's not needed.
*/
$t = $this->html[ $this->tag_name_starts_at ];
if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) {
if (
! $this->is_closing_tag &&
(
'i' === $t || 'I' === $t ||
'n' === $t || 'N' === $t ||
's' === $t || 'S' === $t ||
't' === $t || 'T' === $t
) ) {
$tag_name = $this->get_tag();

if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
Expand All @@ -576,6 +599,25 @@ public function next_tag( $query = null ) {
) {
$this->bytes_already_parsed = strlen( $this->html );
return false;
} elseif (
(
'IFRAME' === $tag_name ||
'NOEMBED' === $tag_name ||
'NOFRAMES' === $tag_name ||
'NOSCRIPT' === $tag_name ||
'STYLE' === $tag_name
) &&
! $this->skip_rawtext( $tag_name )
) {
/*
* "XMP" should be here too but its rules are more complicated and require the
* complexity of the HTML Processor (it needs to close out any open P element,
* meaning it can't be skipped here or else the HTML Processor will lose its
* place). For now, it can be ignored as it's a rare HTML tag in practice and
* any normative HTML should be using PRE instead.
*/
$this->bytes_already_parsed = strlen( $this->html );
return false;
}
}
} while ( $already_found < $this->sought_match_offset );
Expand Down Expand Up @@ -706,15 +748,33 @@ public function release_bookmark( $name ) {
return true;
}

/**
 * Advances past the contents of a generic RAWTEXT element.
 *
 * This is a thin wrapper: the actual scanning is delegated to
 * `skip_rcdata()`, and its result is returned unchanged.
 *
 * @since 6.3.2
 *
 * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
 *
 * @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
 * @return bool Whether an end to the RAWTEXT region was found before the end of the document.
 */
private function skip_rawtext( $tag_name ) {
	/*
	 * RAWTEXT and RCDATA differ only in whether character references get
	 * decoded. Reading the inner markup isn't supported here, so that
	 * difference has no effect and a single implementation serves both.
	 */
	$found_closer = $this->skip_rcdata( $tag_name );

	return $found_closer;
}

/**
* Skips contents of title and textarea tags.
* Skips contents of RCDATA elements, namely title and textarea tags.
*
* @since 6.2.0
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
*
* @param string $tag_name The lowercase tag name which will close the RCDATA region.
* @param string $tag_name The uppercase tag name which will close the RCDATA region.
* @return bool Whether an end to the RCDATA region was found before the end of the document.
*/
private function skip_rcdata( $tag_name ) {
Expand Down Expand Up @@ -947,7 +1007,7 @@ private function parse_next_tag() {

if ( '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
++$at;
$at++;
} else {
$this->is_closing_tag = false;
}
Expand Down Expand Up @@ -1016,7 +1076,7 @@ private function parse_next_tag() {
*
* See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
*/
--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
$closer_at--; // Pre-increment inside condition below reduces risk of accidental infinite looping.
while ( ++$closer_at < strlen( $html ) ) {
$closer_at = strpos( $html, '--', $closer_at );
if ( false === $closer_at ) {
Expand Down Expand Up @@ -1097,7 +1157,7 @@ private function parse_next_tag() {
* See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
*/
if ( '>' === $html[ $at + 1 ] ) {
++$at;
$at++;
continue;
}

Expand Down Expand Up @@ -1236,6 +1296,25 @@ private function parse_next_attribute() {
$attribute_end,
! $has_value
);

return true;
}

/*
* Track the duplicate attributes so if we remove it, all disappear together.
*
* While `$this->duplicate_attributes` could always be stored as an `array()`,
* which would simplify the logic here, storing a `null` and only allocating
* an array when encountering duplicates avoids needless allocations in the
* normative case of parsing tags with no duplicate attributes.
*/
$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end );
if ( null === $this->duplicate_attributes ) {
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
} else {
$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
}

return true;
Expand All @@ -1257,11 +1336,12 @@ private function skip_whitespace() {
*/
private function after_tag() {
// Called before any of the per-tag state below is cleared.
// NOTE(review): presumably this flushes pending updates while the tag offsets are still valid — confirm.
$this->get_updated_html();
// NOTE(review): the next five assignments are repeated immediately afterwards.
// This looks like the removed and added sides of a whitespace-realignment diff
// hunk rendered without +/- markers — confirm against the real file.
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
$this->tag_ends_at = null;
$this->is_closing_tag = null;
$this->attributes = array();
// Reset everything describing the current tag: name offsets, end offset,
// closing-tag flag, and the parsed attributes.
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
$this->tag_ends_at = null;
$this->is_closing_tag = null;
$this->attributes = array();
// Back to `null` rather than an empty array: no duplicates are tracked until one is seen.
$this->duplicate_attributes = null;
}

/**
Expand Down Expand Up @@ -1739,7 +1819,7 @@ public function get_attribute( $name ) {
* @param string $prefix Prefix of requested attribute names.
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ) {
function get_attribute_names_with_prefix( $prefix ) {
if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
return null;
}
Expand Down Expand Up @@ -2030,6 +2110,17 @@ public function remove_attribute( $name ) {
''
);

// Removes any duplicated attributes if they were also present.
if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
$attribute_token->end,
''
);
}
}

return true;
}

Expand Down Expand Up @@ -2282,9 +2373,11 @@ private function matches() {
* See https://html.spec.whatwg.org/#attributes-3
* See https://html.spec.whatwg.org/#space-separated-tokens
*/
do {
$class_at = strpos( $this->html, $this->sought_class_name, $class_at );

while (
// phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
false !== ( $class_at = strpos( $this->html, $this->sought_class_name, $class_at ) ) &&
$class_at < $class_end
) {
/*
* Verify this class starts at a boundary.
*/
Expand All @@ -2310,7 +2403,7 @@ private function matches() {
}

return true;
} while ( false !== $class_at && $class_at < $class_end );
}

return false;
}
Expand Down
Loading

0 comments on commit e1a88b3

Please sign in to comment.