From be3e30f57c8bb5545aa2bb1b796c386637f46100 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 16 Jul 2024 21:23:02 +0200 Subject: [PATCH 1/4] Implement step_in_column_group --- .../html-api/class-wp-html-processor.php | 90 ++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 51802ac558a60..2051657b1a2fe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3068,7 +3068,7 @@ private function step_in_caption(): bool { * This internal function performs the 'in column group' insertion mode * logic for the generalized WP_HTML_Processor::step() function. * - * @since 6.7.0 Stub implementation. + * @since 6.7.0 * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * @@ -3078,7 +3078,93 @@ private function step_in_caption(): bool { * @return bool Whether an element was found. */ private function step_in_column_group(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP . ' state.' ); + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + */ + case '#text': + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + if ( + strspn( $this->html, "\u{0009}\u{000A}\u{000C}\u{000D}\u{0020}", $current_token->start, $current_token->length ) === $current_token->length + ) { + $this->insert_html_element( $this->state->current_token ); + return true; + } + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // @todo Indicate a parse error once it's possible. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "col" + */ + case '+COL': + $this->insert_html_element( $this->state->current_token ); + $this->state->stack_of_open_elements->pop(); + return true; + + /* + * > An end tag whose tag name is "colgroup" + */ + case '-COLGROUP': + if ( ! $this->state->stack_of_open_elements->current_node_is( 'COLGROUP' ) ) { + // @todo Indicate a parse error once it's possible. + return $this->step(); + } + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return true; + + /* + * > An end tag whose tag name is "col" + */ + case '-COL': + // @todo Indicate a parse error once it's possible. + return $this->step(); + + /* + * > A start tag whose tag name is "template" + * > An end tag whose tag name is "template" + */ + case '+TEMPLATE': + case '-TEMPLATE': + return $this->step_in_head(); + } + + /* + * > Anything else + */ + if ( ! $this->state->stack_of_open_elements->current_node_is( 'COLGROUP' ) ) { + // @todo Indicate a parse error once it's possible. + return $this->step(); + } + $this->state->stack_of_open_elements->pop(); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** From 5dd41ffe7f29a87ac2d4a741604fa23eabf7e8bf Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 16 Jul 2024 22:43:54 +0200 Subject: [PATCH 2/4] Better text handling in column group --- src/wp-includes/html-api/class-wp-html-processor.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 2051657b1a2fe..071b9125df7dc 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3090,7 +3090,8 @@ private function step_in_column_group(): bool { case '#text': $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; if ( - strspn( $this->html, "\u{0009}\u{000A}\u{000C}\u{000D}\u{0020}", $current_token->start, $current_token->length ) === $current_token->length + // U+0000 (NULL) Is not mentiond but is included here. Null bytes should be completely ignored. + strspn( $this->html, "\u{0000}\u{0009}\u{000A}\u{000C}\u{000D}\u{0020}", $current_token->start, $current_token->length ) === $current_token->length ) { $this->insert_html_element( $this->state->current_token ); return true; From bbcae26029abc71f58490f3e44ae9f68ad8b7a8b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 31 Jul 2024 19:28:34 +0200 Subject: [PATCH 3/4] Use regular whitespace test checking --- .../html-api/class-wp-html-processor.php | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 071b9125df7dc..770c480cf788d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3088,14 +3088,22 @@ private function step_in_column_group(): bool { * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ case '#text': - $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; - if ( - // U+0000 (NULL) Is not mentiond but is included here. Null bytes should be completely ignored. - strspn( $this->html, "\u{0000}\u{0009}\u{000A}\u{000C}\u{000D}\u{0020}", $current_token->start, $current_token->length ) === $current_token->length - ) { + $text = $this->get_modifiable_text(); + if ( '' === $text ) { + /* + * If the text is empty after processing HTML entities and stripping + * U+0000 NULL bytes then ignore the token. + */ + return $this->step(); + } + + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + // Insert the character. $this->insert_html_element( $this->state->current_token ); return true; } + + goto in_column_group_anything_else; break; /* @@ -3156,6 +3164,7 @@ private function step_in_column_group(): bool { return $this->step_in_head(); } + in_column_group_anything_else: /* * > Anything else */ From f16854fd6f5897fbdc7c2cecc7b9927a396b3a8d Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Aug 2024 14:46:04 -0700 Subject: [PATCH 4/4] Adjust comments --- src/wp-includes/html-api/class-wp-html-processor.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 770c480cf788d..5ff2aa87ffdd6 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3085,7 +3085,8 @@ private function step_in_column_group(): bool { switch ( $op ) { /* - * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ case '#text': $text = $this->get_modifiable_text(); @@ -3152,7 +3153,7 @@ private function step_in_column_group(): bool { * > An end tag whose tag name is "col" */ case '-COL': - // @todo Indicate a parse error once it's possible. + // Parse error: ignore the token. return $this->step(); /*