HTML API: Backport updates from Core

Pulling over work on the HTML API from Core: - bug fix: stop parsing tags inside of STYLE, IFRAME, and other RAWTEXT elements. - bug fix: stay on track when document ends in the middle of processing a tag. - bug fix: remove all copies of an attribute when duplicates exist on a tag. - bug fix: stop processing in the HTML Processor when encountering unsupported markup - reverts changes made in #53866 which introduced a bug during a styling refactor. - update docblock comments through HTML API. This patch also excludes the HTML API folders in the compatability layer from WPCS inspection, since this code is maintained, reviewed, and accepted into Core and brought back into Gutenberg. Changes to the code need to be made first in Core and brought over as blessed backport changes. Co-authored-by: Ari Stathopoulos <aristath@gmail.com> Co-authored-by: Bernie Reiter <96308+ockham@users.noreply.github.com>
WordPress · Sep 27, 2023 · e1a88b3 · e1a88b3
1 parent e082f02
commit e1a88b3
Show file tree

Hide file tree

Showing 5 changed files with 2,626 additions and 68 deletions.
diff --git a/lib/compat/wordpress-6.3/html-api/class-gutenberg-html-tag-processor-6-3.php b/lib/compat/wordpress-6.3/html-api/class-gutenberg-html-tag-processor-6-3.php
@@ -242,6 +242,8 @@
  * unquoted values will appear in the output with double-quotes.
  *
  * @since 6.2.0
+ * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
+ * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
  */
 class Gutenberg_HTML_Tag_Processor_6_3 {
 	/**
@@ -404,6 +406,16 @@ class Gutenberg_HTML_Tag_Processor_6_3 {
 	 */
 	private $attributes = array();
 
+	/**
+	 * Tracks spans of duplicate attributes on a given tag, used for removing
+	 * all copies of an attribute when calling `remove_attribute()`.
+	 *
+	 * @since 6.3.2
+	 *
+	 * @var (WP_HTML_Span[])[]|null
+	 */
+	private $duplicate_attributes = null;
+
 	/**
 	 * Which class names to add or remove from a tag.
 	 *
@@ -546,6 +558,10 @@ public function next_tag( $query = null ) {
 			}
 
 			// Ensure that the tag closes before the end of the document.
+			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+				return false;
+			}
+
 			$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
 			if ( false === $tag_ends_at ) {
 				return false;
@@ -564,7 +580,14 @@ public function next_tag( $query = null ) {
 			 * of the tag name as a pre-check avoids a string allocation when it's not needed.
 			 */
 			$t = $this->html[ $this->tag_name_starts_at ];
-			if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) {
+			if (
+				! $this->is_closing_tag &&
+				(
+					'i' === $t || 'I' === $t ||
+					'n' === $t || 'N' === $t ||
+					's' === $t || 'S' === $t ||
+					't' === $t || 'T' === $t
+				) ) {
 				$tag_name = $this->get_tag();
 
 				if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
@@ -576,6 +599,25 @@ public function next_tag( $query = null ) {
 				) {
 					$this->bytes_already_parsed = strlen( $this->html );
 					return false;
+				} elseif (
+					(
+						'IFRAME' === $tag_name ||
+						'NOEMBED' === $tag_name ||
+						'NOFRAMES' === $tag_name ||
+						'NOSCRIPT' === $tag_name ||
+						'STYLE' === $tag_name
+					) &&
+					! $this->skip_rawtext( $tag_name )
+				) {
+					/*
+					 * "XMP" should be here too but its rules are more complicated and require the
+					 * complexity of the HTML Processor (it needs to close out any open P element,
+					 * meaning it can't be skipped here or else the HTML Processor will lose its
+					 * place). For now, it can be ignored as it's a rare HTML tag in practice and
+					 * any normative HTML should be using PRE instead.
+					 */
+					$this->bytes_already_parsed = strlen( $this->html );
+					return false;
 				}
 			}
 		} while ( $already_found < $this->sought_match_offset );
@@ -706,15 +748,33 @@ public function release_bookmark( $name ) {
 		return true;
 	}
 
+	/**
+	 * Skips contents of generic rawtext elements.
+	 *
+	 * @since 6.3.2
+	 *
+	 * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm
+	 *
+	 * @param string $tag_name The uppercase tag name which will close the RAWTEXT region.
+	 * @return bool Whether an end to the RAWTEXT region was found before the end of the document.
+	 */
+	private function skip_rawtext( $tag_name ) {
+		/*
+		 * These two functions distinguish themselves on whether character references are
+		 * decoded, and since functionality to read the inner markup isn't supported, it's
+		 * not necessary to implement these two functions separately.
+		 */
+		return $this->skip_rcdata( $tag_name );
+	}
 
 	/**
-	 * Skips contents of title and textarea tags.
+	 * Skips contents of RCDATA elements, namely title and textarea tags.
 	 *
 	 * @since 6.2.0
 	 *
 	 * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
 	 *
-	 * @param string $tag_name The lowercase tag name which will close the RCDATA region.
+	 * @param string $tag_name The uppercase tag name which will close the RCDATA region.
 	 * @return bool Whether an end to the RCDATA region was found before the end of the document.
 	 */
 	private function skip_rcdata( $tag_name ) {
@@ -947,7 +1007,7 @@ private function parse_next_tag() {
 
 			if ( '/' === $this->html[ $at + 1 ] ) {
 				$this->is_closing_tag = true;
-				++$at;
+				$at++;
 			} else {
 				$this->is_closing_tag = false;
 			}
@@ -1016,7 +1076,7 @@ private function parse_next_tag() {
 					 *
 					 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
 					 */
-					--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
+					$closer_at--; // Pre-increment inside condition below reduces risk of accidental infinite looping.
 					while ( ++$closer_at < strlen( $html ) ) {
 						$closer_at = strpos( $html, '--', $closer_at );
 						if ( false === $closer_at ) {
@@ -1097,7 +1157,7 @@ private function parse_next_tag() {
 			 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
 			 */
 			if ( '>' === $html[ $at + 1 ] ) {
-				++$at;
+				$at++;
 				continue;
 			}
 
@@ -1236,6 +1296,25 @@ private function parse_next_attribute() {
 				$attribute_end,
 				! $has_value
 			);
+
+			return true;
+		}
+
+		/*
+		 * Track the duplicate attributes so if we remove it, all disappear together.
+		 *
+		 * While `$this->duplicated_attributes` could always be stored as an `array()`,
+		 * which would simplify the logic here, storing a `null` and only allocating
+		 * an array when encountering duplicates avoids needless allocations in the
+		 * normative case of parsing tags with no duplicate attributes.
+		 */
+		$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end );
+		if ( null === $this->duplicate_attributes ) {
+			$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
+		} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
+			$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
+		} else {
+			$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
 		}
 
 		return true;
@@ -1257,11 +1336,12 @@ private function skip_whitespace() {
 	 */
 	private function after_tag() {
 		$this->get_updated_html();
-		$this->tag_name_starts_at = null;
-		$this->tag_name_length    = null;
-		$this->tag_ends_at        = null;
-		$this->is_closing_tag     = null;
-		$this->attributes         = array();
+		$this->tag_name_starts_at   = null;
+		$this->tag_name_length      = null;
+		$this->tag_ends_at          = null;
+		$this->is_closing_tag       = null;
+		$this->attributes           = array();
+		$this->duplicate_attributes = null;
 	}
 
 	/**
@@ -1739,7 +1819,7 @@ public function get_attribute( $name ) {
 	 * @param string $prefix Prefix of requested attribute names.
 	 * @return array|null List of attribute names, or `null` when no tag opener is matched.
 	 */
-	public function get_attribute_names_with_prefix( $prefix ) {
+	function get_attribute_names_with_prefix( $prefix ) {
 		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
 			return null;
 		}
@@ -2030,6 +2110,17 @@ public function remove_attribute( $name ) {
 			''
 		);
 
+		// Removes any duplicated attributes if they were also present.
+		if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
+			foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
+				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+					$attribute_token->start,
+					$attribute_token->end,
+					''
+				);
+			}
+		}
+
 		return true;
 	}
 
@@ -2282,9 +2373,11 @@ private function matches() {
 			 * See https://html.spec.whatwg.org/#attributes-3
 			 * See https://html.spec.whatwg.org/#space-separated-tokens
 			 */
-			do {
-				$class_at = strpos( $this->html, $this->sought_class_name, $class_at );
-
+			while (
+				// phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
+				false !== ( $class_at = strpos( $this->html, $this->sought_class_name, $class_at ) ) &&
+				$class_at < $class_end
+			) {
 				/*
 				 * Verify this class starts at a boundary.
 				 */
@@ -2310,7 +2403,7 @@ private function matches() {
 				}
 
 				return true;
-			} while ( false !== $class_at && $class_at < $class_end );
+			}
 
 			return false;
 		}